diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,16525 +1,34 @@ { - "best_global_step": 2090, - "best_metric": 0.05588439479470253, - "best_model_checkpoint": "./results/checkpoint-2090", - "epoch": 42.0, + "best_global_step": 1, + "best_metric": 0.2202187031507492, + "best_model_checkpoint": "./results/checkpoint-1", + "epoch": 1.0, "eval_steps": 500, - "global_step": 2310, + "global_step": 1, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.01818181818181818, - "grad_norm": 37.05387496948242, + "epoch": 1.0, + "grad_norm": 6.83254861831665, "learning_rate": 1e-05, - "loss": 0.5272, + "loss": 0.0914, "step": 1 }, - { - "epoch": 0.03636363636363636, - "grad_norm": 0.18264199793338776, - "learning_rate": 9.996363636363638e-06, - "loss": 0.1346, - "step": 2 - }, - { - "epoch": 0.05454545454545454, - "grad_norm": 21.658315658569336, - "learning_rate": 9.992727272727274e-06, - "loss": 0.2743, - "step": 3 - }, - { - "epoch": 0.07272727272727272, - "grad_norm": 19.92592430114746, - "learning_rate": 9.98909090909091e-06, - "loss": 0.2512, - "step": 4 - }, - { - "epoch": 0.09090909090909091, - "grad_norm": 6.347537040710449, - "learning_rate": 9.985454545454546e-06, - "loss": 0.143, - "step": 5 - }, - { - "epoch": 0.10909090909090909, - "grad_norm": 13.259781837463379, - "learning_rate": 9.981818181818183e-06, - "loss": 0.1792, - "step": 6 - }, - { - "epoch": 0.12727272727272726, - "grad_norm": 19.668493270874023, - "learning_rate": 9.97818181818182e-06, - "loss": 0.2479, - "step": 7 - }, - { - "epoch": 0.14545454545454545, - "grad_norm": 16.110191345214844, - "learning_rate": 9.974545454545456e-06, - "loss": 0.1929, - "step": 8 - }, - { - "epoch": 0.16363636363636364, - "grad_norm": 8.150792121887207, - "learning_rate": 9.970909090909093e-06, - "loss": 0.1383, - "step": 9 - }, - { - "epoch": 0.18181818181818182, - "grad_norm": 4.64362907409668, - "learning_rate": 9.967272727272728e-06, - "loss": 0.1193, - "step": 10 - }, - { - "epoch": 0.2, - "grad_norm": 5.752618789672852, - "learning_rate": 9.963636363636364e-06, - "loss": 0.1392, - "step": 11 - }, - { - "epoch": 0.21818181818181817, - "grad_norm": 5.742602825164795, - "learning_rate": 9.960000000000001e-06, - "loss": 0.1566, - "step": 12 - }, - { - "epoch": 0.23636363636363636, - "grad_norm": 1.4384528398513794, - "learning_rate": 9.956363636363638e-06, - "loss": 0.146, - "step": 13 - }, - { - "epoch": 0.2545454545454545, - "grad_norm": 0.7851634621620178, - "learning_rate": 9.952727272727275e-06, - "loss": 0.1353, - "step": 14 - }, - { - "epoch": 0.2727272727272727, - "grad_norm": 4.292611122131348, - "learning_rate": 9.94909090909091e-06, - "loss": 0.1168, - "step": 15 - }, - { - "epoch": 0.2909090909090909, - "grad_norm": 0.22542831301689148, - "learning_rate": 9.945454545454546e-06, - "loss": 0.1366, - "step": 16 - }, - { - "epoch": 0.3090909090909091, - "grad_norm": 0.19275131821632385, - "learning_rate": 9.941818181818183e-06, - "loss": 0.1232, - "step": 17 - }, - { - "epoch": 0.32727272727272727, - "grad_norm": 3.903566598892212, - "learning_rate": 9.93818181818182e-06, - "loss": 0.1397, - "step": 18 - }, - { - "epoch": 0.34545454545454546, - "grad_norm": 0.7843319773674011, - "learning_rate": 9.934545454545455e-06, - "loss": 0.1255, - "step": 19 - }, - { - "epoch": 0.36363636363636365, - "grad_norm": 0.6306525468826294, - "learning_rate": 9.930909090909091e-06, - "loss": 0.1214, - "step": 20 - }, - { - "epoch": 0.38181818181818183, - "grad_norm": 1.7508517503738403, - "learning_rate": 9.927272727272728e-06, - "loss": 0.1013, - "step": 21 - }, - { - "epoch": 0.4, - "grad_norm": 2.8019816875457764, - "learning_rate": 9.923636363636365e-06, - "loss": 0.1345, - "step": 22 - }, - { - "epoch": 0.41818181818181815, - "grad_norm": 1.256691575050354, - "learning_rate": 9.920000000000002e-06, - "loss": 0.1454, - "step": 23 - }, - { - "epoch": 0.43636363636363634, - "grad_norm": 2.150932788848877, - "learning_rate": 9.916363636363637e-06, - "loss": 0.1242, - "step": 24 - }, - { - "epoch": 0.45454545454545453, - "grad_norm": 0.38700780272483826, - "learning_rate": 9.912727272727273e-06, - "loss": 0.1139, - "step": 25 - }, - { - "epoch": 0.4727272727272727, - "grad_norm": 2.0877628326416016, - "learning_rate": 9.90909090909091e-06, - "loss": 0.1128, - "step": 26 - }, - { - "epoch": 0.4909090909090909, - "grad_norm": 1.6190087795257568, - "learning_rate": 9.905454545454547e-06, - "loss": 0.1188, - "step": 27 - }, - { - "epoch": 0.509090909090909, - "grad_norm": 2.19464111328125, - "learning_rate": 9.901818181818182e-06, - "loss": 0.125, - "step": 28 - }, - { - "epoch": 0.5272727272727272, - "grad_norm": 0.4447917640209198, - "learning_rate": 9.898181818181818e-06, - "loss": 0.1274, - "step": 29 - }, - { - "epoch": 0.5454545454545454, - "grad_norm": 2.8872969150543213, - "learning_rate": 9.894545454545455e-06, - "loss": 0.1133, - "step": 30 - }, - { - "epoch": 0.5636363636363636, - "grad_norm": 4.072057247161865, - "learning_rate": 9.890909090909092e-06, - "loss": 0.1499, - "step": 31 - }, - { - "epoch": 0.5818181818181818, - "grad_norm": 1.1567100286483765, - "learning_rate": 9.887272727272727e-06, - "loss": 0.1098, - "step": 32 - }, - { - "epoch": 0.6, - "grad_norm": 3.222627878189087, - "learning_rate": 9.883636363636364e-06, - "loss": 0.1184, - "step": 33 - }, - { - "epoch": 0.6181818181818182, - "grad_norm": 3.7605419158935547, - "learning_rate": 9.88e-06, - "loss": 0.1195, - "step": 34 - }, - { - "epoch": 0.6363636363636364, - "grad_norm": 2.9499497413635254, - "learning_rate": 9.876363636363637e-06, - "loss": 0.1391, - "step": 35 - }, - { - "epoch": 0.6545454545454545, - "grad_norm": 1.1921433210372925, - "learning_rate": 9.872727272727274e-06, - "loss": 0.0916, - "step": 36 - }, - { - "epoch": 0.6727272727272727, - "grad_norm": 5.03579044342041, - "learning_rate": 9.86909090909091e-06, - "loss": 0.1215, - "step": 37 - }, - { - "epoch": 0.6909090909090909, - "grad_norm": 2.758638620376587, - "learning_rate": 9.865454545454545e-06, - "loss": 0.1208, - "step": 38 - }, - { - "epoch": 0.7090909090909091, - "grad_norm": 1.5219608545303345, - "learning_rate": 9.861818181818182e-06, - "loss": 0.1226, - "step": 39 - }, - { - "epoch": 0.7272727272727273, - "grad_norm": 0.18038880825042725, - "learning_rate": 9.858181818181819e-06, - "loss": 0.1107, - "step": 40 - }, - { - "epoch": 0.7454545454545455, - "grad_norm": 2.1407017707824707, - "learning_rate": 9.854545454545456e-06, - "loss": 0.1049, - "step": 41 - }, - { - "epoch": 0.7636363636363637, - "grad_norm": 5.125903129577637, - "learning_rate": 9.850909090909092e-06, - "loss": 0.1316, - "step": 42 - }, - { - "epoch": 0.7818181818181819, - "grad_norm": 2.549814224243164, - "learning_rate": 9.847272727272727e-06, - "loss": 0.1195, - "step": 43 - }, - { - "epoch": 0.8, - "grad_norm": 2.8272857666015625, - "learning_rate": 9.843636363636364e-06, - "loss": 0.1068, - "step": 44 - }, - { - "epoch": 0.8181818181818182, - "grad_norm": 2.130585193634033, - "learning_rate": 9.84e-06, - "loss": 0.0982, - "step": 45 - }, - { - "epoch": 0.8363636363636363, - "grad_norm": 4.136003494262695, - "learning_rate": 9.836363636363637e-06, - "loss": 0.1349, - "step": 46 - }, - { - "epoch": 0.8545454545454545, - "grad_norm": 3.19783878326416, - "learning_rate": 9.832727272727274e-06, - "loss": 0.0976, - "step": 47 - }, - { - "epoch": 0.8727272727272727, - "grad_norm": 0.5724593997001648, - "learning_rate": 9.82909090909091e-06, - "loss": 0.1059, - "step": 48 - }, - { - "epoch": 0.8909090909090909, - "grad_norm": 6.178801536560059, - "learning_rate": 9.825454545454546e-06, - "loss": 0.1259, - "step": 49 - }, - { - "epoch": 0.9090909090909091, - "grad_norm": 6.855656623840332, - "learning_rate": 9.821818181818182e-06, - "loss": 0.1143, - "step": 50 - }, - { - "epoch": 0.9272727272727272, - "grad_norm": 3.7899281978607178, - "learning_rate": 9.81818181818182e-06, - "loss": 0.1213, - "step": 51 - }, - { - "epoch": 0.9454545454545454, - "grad_norm": 3.714876890182495, - "learning_rate": 9.814545454545456e-06, - "loss": 0.1085, - "step": 52 - }, - { - "epoch": 0.9636363636363636, - "grad_norm": 5.582960605621338, - "learning_rate": 9.810909090909093e-06, - "loss": 0.1057, - "step": 53 - }, - { - "epoch": 0.9818181818181818, - "grad_norm": 4.269004821777344, - "learning_rate": 9.80727272727273e-06, - "loss": 0.1052, - "step": 54 - }, { "epoch": 1.0, - "grad_norm": 4.530034065246582, - "learning_rate": 9.803636363636364e-06, - "loss": 0.1091, - "step": 55 - }, - { - "epoch": 1.0, - "eval_loss": 0.10722891986370087, - "eval_runtime": 9.3372, - "eval_samples_per_second": 583.149, - "eval_steps_per_second": 72.934, - "step": 55 - }, - { - "epoch": 1.018181818181818, - "grad_norm": 4.766411304473877, - "learning_rate": 9.800000000000001e-06, - "loss": 0.0959, - "step": 56 - }, - { - "epoch": 1.0363636363636364, - "grad_norm": 1.5101197957992554, - "learning_rate": 9.796363636363638e-06, - "loss": 0.1087, - "step": 57 - }, - { - "epoch": 1.0545454545454545, - "grad_norm": 3.720600128173828, - "learning_rate": 9.792727272727274e-06, - "loss": 0.0782, - "step": 58 - }, - { - "epoch": 1.0727272727272728, - "grad_norm": 9.043734550476074, - "learning_rate": 9.78909090909091e-06, - "loss": 0.1202, - "step": 59 - }, - { - "epoch": 1.0909090909090908, - "grad_norm": 6.023186206817627, - "learning_rate": 9.785454545454546e-06, - "loss": 0.0953, - "step": 60 - }, - { - "epoch": 1.1090909090909091, - "grad_norm": 1.1146883964538574, - "learning_rate": 9.781818181818183e-06, - "loss": 0.0891, - "step": 61 - }, - { - "epoch": 1.1272727272727272, - "grad_norm": 2.238832712173462, - "learning_rate": 9.77818181818182e-06, - "loss": 0.1052, - "step": 62 - }, - { - "epoch": 1.1454545454545455, - "grad_norm": 0.548115074634552, - "learning_rate": 9.774545454545456e-06, - "loss": 0.0902, - "step": 63 - }, - { - "epoch": 1.1636363636363636, - "grad_norm": 6.493234634399414, - "learning_rate": 9.770909090909091e-06, - "loss": 0.1177, - "step": 64 - }, - { - "epoch": 1.1818181818181819, - "grad_norm": 4.756880760192871, - "learning_rate": 9.767272727272728e-06, - "loss": 0.0938, - "step": 65 - }, - { - "epoch": 1.2, - "grad_norm": 2.2745614051818848, - "learning_rate": 9.763636363636365e-06, - "loss": 0.1004, - "step": 66 - }, - { - "epoch": 1.2181818181818183, - "grad_norm": 5.856293678283691, - "learning_rate": 9.760000000000001e-06, - "loss": 0.1014, - "step": 67 - }, - { - "epoch": 1.2363636363636363, - "grad_norm": 9.506874084472656, - "learning_rate": 9.756363636363636e-06, - "loss": 0.1131, - "step": 68 - }, - { - "epoch": 1.2545454545454544, - "grad_norm": 9.665825843811035, - "learning_rate": 9.752727272727273e-06, - "loss": 0.1196, - "step": 69 - }, - { - "epoch": 1.2727272727272727, - "grad_norm": 3.3099265098571777, - "learning_rate": 9.74909090909091e-06, - "loss": 0.115, - "step": 70 - }, - { - "epoch": 1.290909090909091, - "grad_norm": 6.175958633422852, - "learning_rate": 9.745454545454547e-06, - "loss": 0.1081, - "step": 71 - }, - { - "epoch": 1.309090909090909, - "grad_norm": 9.576207160949707, - "learning_rate": 9.741818181818182e-06, - "loss": 0.1253, - "step": 72 - }, - { - "epoch": 1.3272727272727272, - "grad_norm": 9.32327651977539, - "learning_rate": 9.738181818181818e-06, - "loss": 0.1248, - "step": 73 - }, - { - "epoch": 1.3454545454545455, - "grad_norm": 5.23323392868042, - "learning_rate": 9.734545454545455e-06, - "loss": 0.0938, - "step": 74 - }, - { - "epoch": 1.3636363636363638, - "grad_norm": 3.476034641265869, - "learning_rate": 9.730909090909092e-06, - "loss": 0.0879, - "step": 75 - }, - { - "epoch": 1.3818181818181818, - "grad_norm": 6.531972408294678, - "learning_rate": 9.727272727272728e-06, - "loss": 0.1043, - "step": 76 - }, - { - "epoch": 1.4, - "grad_norm": 5.5768914222717285, - "learning_rate": 9.723636363636363e-06, - "loss": 0.0876, - "step": 77 - }, - { - "epoch": 1.4181818181818182, - "grad_norm": 0.480271577835083, - "learning_rate": 9.72e-06, - "loss": 0.0945, - "step": 78 - }, - { - "epoch": 1.4363636363636363, - "grad_norm": 6.237101078033447, - "learning_rate": 9.716363636363637e-06, - "loss": 0.0963, - "step": 79 - }, - { - "epoch": 1.4545454545454546, - "grad_norm": 8.538503646850586, - "learning_rate": 9.712727272727274e-06, - "loss": 0.1083, - "step": 80 - }, - { - "epoch": 1.4727272727272727, - "grad_norm": 5.826234817504883, - "learning_rate": 9.70909090909091e-06, - "loss": 0.0792, - "step": 81 - }, - { - "epoch": 1.490909090909091, - "grad_norm": 1.0556098222732544, - "learning_rate": 9.705454545454547e-06, - "loss": 0.0826, - "step": 82 - }, - { - "epoch": 1.509090909090909, - "grad_norm": 2.9839580059051514, - "learning_rate": 9.701818181818182e-06, - "loss": 0.0961, - "step": 83 - }, - { - "epoch": 1.5272727272727273, - "grad_norm": 0.5258766412734985, - "learning_rate": 9.698181818181819e-06, - "loss": 0.0707, - "step": 84 - }, - { - "epoch": 1.5454545454545454, - "grad_norm": 0.3032699525356293, - "learning_rate": 9.694545454545455e-06, - "loss": 0.0818, - "step": 85 - }, - { - "epoch": 1.5636363636363635, - "grad_norm": 1.450802206993103, - "learning_rate": 9.690909090909092e-06, - "loss": 0.0938, - "step": 86 - }, - { - "epoch": 1.5818181818181818, - "grad_norm": 1.264906883239746, - "learning_rate": 9.687272727272729e-06, - "loss": 0.0825, - "step": 87 - }, - { - "epoch": 1.6, - "grad_norm": 0.19814220070838928, - "learning_rate": 9.683636363636364e-06, - "loss": 0.083, - "step": 88 - }, - { - "epoch": 1.6181818181818182, - "grad_norm": 1.0082038640975952, - "learning_rate": 9.68e-06, - "loss": 0.0957, - "step": 89 - }, - { - "epoch": 1.6363636363636362, - "grad_norm": 0.16697710752487183, - "learning_rate": 9.676363636363637e-06, - "loss": 0.095, - "step": 90 - }, - { - "epoch": 1.6545454545454545, - "grad_norm": 3.0858652591705322, - "learning_rate": 9.672727272727274e-06, - "loss": 0.0829, - "step": 91 - }, - { - "epoch": 1.6727272727272728, - "grad_norm": 0.28391602635383606, - "learning_rate": 9.66909090909091e-06, - "loss": 0.0884, - "step": 92 - }, - { - "epoch": 1.690909090909091, - "grad_norm": 0.18711575865745544, - "learning_rate": 9.665454545454547e-06, - "loss": 0.0818, - "step": 93 - }, - { - "epoch": 1.709090909090909, - "grad_norm": 1.8125078678131104, - "learning_rate": 9.661818181818182e-06, - "loss": 0.0845, - "step": 94 - }, - { - "epoch": 1.7272727272727273, - "grad_norm": 1.3579044342041016, - "learning_rate": 9.658181818181819e-06, - "loss": 0.0743, - "step": 95 - }, - { - "epoch": 1.7454545454545456, - "grad_norm": 0.7960511445999146, - "learning_rate": 9.654545454545456e-06, - "loss": 0.1005, - "step": 96 - }, - { - "epoch": 1.7636363636363637, - "grad_norm": 2.742464542388916, - "learning_rate": 9.650909090909092e-06, - "loss": 0.0935, - "step": 97 - }, - { - "epoch": 1.7818181818181817, - "grad_norm": 0.5498592853546143, - "learning_rate": 9.64727272727273e-06, - "loss": 0.082, - "step": 98 - }, - { - "epoch": 1.8, - "grad_norm": 1.8489519357681274, - "learning_rate": 9.643636363636364e-06, - "loss": 0.0767, - "step": 99 - }, - { - "epoch": 1.8181818181818183, - "grad_norm": 0.9120833277702332, - "learning_rate": 9.640000000000001e-06, - "loss": 0.0657, - "step": 100 - }, - { - "epoch": 1.8363636363636364, - "grad_norm": 0.5977105498313904, - "learning_rate": 9.636363636363638e-06, - "loss": 0.0823, - "step": 101 - }, - { - "epoch": 1.8545454545454545, - "grad_norm": 0.8194651007652283, - "learning_rate": 9.632727272727274e-06, - "loss": 0.0665, - "step": 102 - }, - { - "epoch": 1.8727272727272726, - "grad_norm": 0.6510651707649231, - "learning_rate": 9.629090909090911e-06, - "loss": 0.0791, - "step": 103 - }, - { - "epoch": 1.8909090909090909, - "grad_norm": 2.958561420440674, - "learning_rate": 9.625454545454546e-06, - "loss": 0.0755, - "step": 104 - }, - { - "epoch": 1.9090909090909092, - "grad_norm": 1.1365717649459839, - "learning_rate": 9.621818181818183e-06, - "loss": 0.0798, - "step": 105 - }, - { - "epoch": 1.9272727272727272, - "grad_norm": 0.3904542922973633, - "learning_rate": 9.61818181818182e-06, - "loss": 0.0797, - "step": 106 - }, - { - "epoch": 1.9454545454545453, - "grad_norm": 2.3255527019500732, - "learning_rate": 9.614545454545456e-06, - "loss": 0.0789, - "step": 107 - }, - { - "epoch": 1.9636363636363636, - "grad_norm": 0.8023898005485535, - "learning_rate": 9.610909090909091e-06, - "loss": 0.0797, - "step": 108 - }, - { - "epoch": 1.981818181818182, - "grad_norm": 1.1171375513076782, - "learning_rate": 9.607272727272728e-06, - "loss": 0.0781, - "step": 109 - }, - { - "epoch": 2.0, - "grad_norm": 3.1299049854278564, - "learning_rate": 9.603636363636365e-06, - "loss": 0.0825, - "step": 110 - }, - { - "epoch": 2.0, - "eval_loss": 0.08001140505075455, - "eval_runtime": 9.2515, - "eval_samples_per_second": 588.554, - "eval_steps_per_second": 73.61, - "step": 110 - }, - { - "epoch": 2.018181818181818, - "grad_norm": 3.0676770210266113, - "learning_rate": 9.600000000000001e-06, - "loss": 0.0934, - "step": 111 - }, - { - "epoch": 2.036363636363636, - "grad_norm": 1.0747575759887695, - "learning_rate": 9.596363636363636e-06, - "loss": 0.079, - "step": 112 - }, - { - "epoch": 2.0545454545454547, - "grad_norm": 3.7210536003112793, - "learning_rate": 9.592727272727273e-06, - "loss": 0.0698, - "step": 113 - }, - { - "epoch": 2.0727272727272728, - "grad_norm": 2.084446668624878, - "learning_rate": 9.58909090909091e-06, - "loss": 0.0657, - "step": 114 - }, - { - "epoch": 2.090909090909091, - "grad_norm": 0.47005435824394226, - "learning_rate": 9.585454545454546e-06, - "loss": 0.0812, - "step": 115 - }, - { - "epoch": 2.109090909090909, - "grad_norm": 2.7301416397094727, - "learning_rate": 9.581818181818181e-06, - "loss": 0.0623, - "step": 116 - }, - { - "epoch": 2.1272727272727274, - "grad_norm": 3.2547695636749268, - "learning_rate": 9.578181818181818e-06, - "loss": 0.0738, - "step": 117 - }, - { - "epoch": 2.1454545454545455, - "grad_norm": 3.0508649349212646, - "learning_rate": 9.574545454545455e-06, - "loss": 0.082, - "step": 118 - }, - { - "epoch": 2.1636363636363636, - "grad_norm": 3.0950398445129395, - "learning_rate": 9.570909090909092e-06, - "loss": 0.0808, - "step": 119 - }, - { - "epoch": 2.1818181818181817, - "grad_norm": 4.160436630249023, - "learning_rate": 9.567272727272728e-06, - "loss": 0.0793, - "step": 120 - }, - { - "epoch": 2.2, - "grad_norm": 1.2456765174865723, - "learning_rate": 9.563636363636365e-06, - "loss": 0.0611, - "step": 121 - }, - { - "epoch": 2.2181818181818183, - "grad_norm": 6.800936698913574, - "learning_rate": 9.56e-06, - "loss": 0.0776, - "step": 122 - }, - { - "epoch": 2.2363636363636363, - "grad_norm": 10.95822811126709, - "learning_rate": 9.556363636363637e-06, - "loss": 0.1031, - "step": 123 - }, - { - "epoch": 2.2545454545454544, - "grad_norm": 6.5864949226379395, - "learning_rate": 9.552727272727273e-06, - "loss": 0.0687, - "step": 124 - }, - { - "epoch": 2.2727272727272725, - "grad_norm": 2.7592520713806152, - "learning_rate": 9.54909090909091e-06, - "loss": 0.0615, - "step": 125 - }, - { - "epoch": 2.290909090909091, - "grad_norm": 7.407677173614502, - "learning_rate": 9.545454545454547e-06, - "loss": 0.1015, - "step": 126 - }, - { - "epoch": 2.309090909090909, - "grad_norm": 11.68075180053711, - "learning_rate": 9.541818181818182e-06, - "loss": 0.1254, - "step": 127 - }, - { - "epoch": 2.327272727272727, - "grad_norm": 12.06436538696289, - "learning_rate": 9.538181818181818e-06, - "loss": 0.1143, - "step": 128 - }, - { - "epoch": 2.3454545454545457, - "grad_norm": 6.120307445526123, - "learning_rate": 9.534545454545455e-06, - "loss": 0.072, - "step": 129 - }, - { - "epoch": 2.3636363636363638, - "grad_norm": 1.2238420248031616, - "learning_rate": 9.530909090909092e-06, - "loss": 0.0723, - "step": 130 - }, - { - "epoch": 2.381818181818182, - "grad_norm": 5.581143379211426, - "learning_rate": 9.527272727272729e-06, - "loss": 0.0826, - "step": 131 - }, - { - "epoch": 2.4, - "grad_norm": 4.355954170227051, - "learning_rate": 9.523636363636365e-06, - "loss": 0.079, - "step": 132 - }, - { - "epoch": 2.418181818181818, - "grad_norm": 2.2277379035949707, - "learning_rate": 9.52e-06, - "loss": 0.0722, - "step": 133 - }, - { - "epoch": 2.4363636363636365, - "grad_norm": 1.6481623649597168, - "learning_rate": 9.516363636363637e-06, - "loss": 0.0685, - "step": 134 - }, - { - "epoch": 2.4545454545454546, - "grad_norm": 0.6039285659790039, - "learning_rate": 9.512727272727274e-06, - "loss": 0.069, - "step": 135 - }, - { - "epoch": 2.4727272727272727, - "grad_norm": 2.430091619491577, - "learning_rate": 9.50909090909091e-06, - "loss": 0.0704, - "step": 136 - }, - { - "epoch": 2.4909090909090907, - "grad_norm": 0.7880024909973145, - "learning_rate": 9.505454545454547e-06, - "loss": 0.0588, - "step": 137 - }, - { - "epoch": 2.509090909090909, - "grad_norm": 0.39421379566192627, - "learning_rate": 9.501818181818182e-06, - "loss": 0.0658, - "step": 138 - }, - { - "epoch": 2.5272727272727273, - "grad_norm": 0.9048682451248169, - "learning_rate": 9.498181818181819e-06, - "loss": 0.0759, - "step": 139 - }, - { - "epoch": 2.5454545454545454, - "grad_norm": 3.013659954071045, - "learning_rate": 9.494545454545456e-06, - "loss": 0.075, - "step": 140 - }, - { - "epoch": 2.5636363636363635, - "grad_norm": 2.505497932434082, - "learning_rate": 9.490909090909092e-06, - "loss": 0.0763, - "step": 141 - }, - { - "epoch": 2.581818181818182, - "grad_norm": 0.930428147315979, - "learning_rate": 9.487272727272729e-06, - "loss": 0.0704, - "step": 142 - }, - { - "epoch": 2.6, - "grad_norm": 3.1685171127319336, - "learning_rate": 9.483636363636366e-06, - "loss": 0.0816, - "step": 143 - }, - { - "epoch": 2.618181818181818, - "grad_norm": 0.1914507895708084, - "learning_rate": 9.48e-06, - "loss": 0.093, - "step": 144 - }, - { - "epoch": 2.6363636363636362, - "grad_norm": 6.089909553527832, - "learning_rate": 9.476363636363637e-06, - "loss": 0.0819, - "step": 145 - }, - { - "epoch": 2.6545454545454543, - "grad_norm": 2.829538106918335, - "learning_rate": 9.472727272727274e-06, - "loss": 0.0657, - "step": 146 - }, - { - "epoch": 2.672727272727273, - "grad_norm": 1.3468666076660156, - "learning_rate": 9.46909090909091e-06, - "loss": 0.0904, - "step": 147 - }, - { - "epoch": 2.690909090909091, - "grad_norm": 2.2511019706726074, - "learning_rate": 9.465454545454546e-06, - "loss": 0.0624, - "step": 148 - }, - { - "epoch": 2.709090909090909, - "grad_norm": 3.225182294845581, - "learning_rate": 9.461818181818183e-06, - "loss": 0.0785, - "step": 149 - }, - { - "epoch": 2.7272727272727275, - "grad_norm": 2.025791645050049, - "learning_rate": 9.45818181818182e-06, - "loss": 0.064, - "step": 150 - }, - { - "epoch": 2.7454545454545456, - "grad_norm": 3.491480588912964, - "learning_rate": 9.454545454545456e-06, - "loss": 0.0784, - "step": 151 - }, - { - "epoch": 2.7636363636363637, - "grad_norm": 3.9485855102539062, - "learning_rate": 9.450909090909091e-06, - "loss": 0.076, - "step": 152 - }, - { - "epoch": 2.7818181818181817, - "grad_norm": 0.681100070476532, - "learning_rate": 9.447272727272728e-06, - "loss": 0.0742, - "step": 153 - }, - { - "epoch": 2.8, - "grad_norm": 0.9131883382797241, - "learning_rate": 9.443636363636364e-06, - "loss": 0.0779, - "step": 154 - }, - { - "epoch": 2.8181818181818183, - "grad_norm": 0.1974269300699234, - "learning_rate": 9.440000000000001e-06, - "loss": 0.0625, - "step": 155 - }, - { - "epoch": 2.8363636363636364, - "grad_norm": 6.462656021118164, - "learning_rate": 9.436363636363636e-06, - "loss": 0.099, - "step": 156 - }, - { - "epoch": 2.8545454545454545, - "grad_norm": 2.845747232437134, - "learning_rate": 9.432727272727273e-06, - "loss": 0.052, - "step": 157 - }, - { - "epoch": 2.8727272727272726, - "grad_norm": 2.281165599822998, - "learning_rate": 9.42909090909091e-06, - "loss": 0.0791, - "step": 158 - }, - { - "epoch": 2.8909090909090907, - "grad_norm": 1.4044369459152222, - "learning_rate": 9.425454545454546e-06, - "loss": 0.0886, - "step": 159 - }, - { - "epoch": 2.909090909090909, - "grad_norm": 0.5744118094444275, - "learning_rate": 9.421818181818183e-06, - "loss": 0.0706, - "step": 160 - }, - { - "epoch": 2.9272727272727272, - "grad_norm": 3.8460161685943604, - "learning_rate": 9.418181818181818e-06, - "loss": 0.0668, - "step": 161 - }, - { - "epoch": 2.9454545454545453, - "grad_norm": 5.384547233581543, - "learning_rate": 9.414545454545455e-06, - "loss": 0.0968, - "step": 162 - }, - { - "epoch": 2.963636363636364, - "grad_norm": 0.7490556836128235, - "learning_rate": 9.410909090909091e-06, - "loss": 0.0624, - "step": 163 - }, - { - "epoch": 2.981818181818182, - "grad_norm": 5.801324367523193, - "learning_rate": 9.407272727272728e-06, - "loss": 0.0966, - "step": 164 - }, - { - "epoch": 3.0, - "grad_norm": 7.313564777374268, - "learning_rate": 9.403636363636365e-06, - "loss": 0.064, - "step": 165 - }, - { - "epoch": 3.0, - "eval_loss": 0.08795104175806046, - "eval_runtime": 9.2415, - "eval_samples_per_second": 589.188, - "eval_steps_per_second": 73.689, - "step": 165 - }, - { - "epoch": 3.018181818181818, - "grad_norm": 6.913625240325928, - "learning_rate": 9.4e-06, - "loss": 0.0855, - "step": 166 - }, - { - "epoch": 3.036363636363636, - "grad_norm": 0.7381065487861633, - "learning_rate": 9.396363636363636e-06, - "loss": 0.0713, - "step": 167 - }, - { - "epoch": 3.0545454545454547, - "grad_norm": 7.956295013427734, - "learning_rate": 9.392727272727273e-06, - "loss": 0.0951, - "step": 168 - }, - { - "epoch": 3.0727272727272728, - "grad_norm": 10.571431159973145, - "learning_rate": 9.38909090909091e-06, - "loss": 0.1055, - "step": 169 - }, - { - "epoch": 3.090909090909091, - "grad_norm": 8.65732479095459, - "learning_rate": 9.385454545454547e-06, - "loss": 0.0814, - "step": 170 - }, - { - "epoch": 3.109090909090909, - "grad_norm": 5.023076057434082, - "learning_rate": 9.381818181818183e-06, - "loss": 0.0758, - "step": 171 - }, - { - "epoch": 3.1272727272727274, - "grad_norm": 6.323506832122803, - "learning_rate": 9.378181818181818e-06, - "loss": 0.0693, - "step": 172 - }, - { - "epoch": 3.1454545454545455, - "grad_norm": 8.620816230773926, - "learning_rate": 9.374545454545455e-06, - "loss": 0.0745, - "step": 173 - }, - { - "epoch": 3.1636363636363636, - "grad_norm": 7.247929096221924, - "learning_rate": 9.370909090909092e-06, - "loss": 0.085, - "step": 174 - }, - { - "epoch": 3.1818181818181817, - "grad_norm": 3.6532492637634277, - "learning_rate": 9.367272727272728e-06, - "loss": 0.0618, - "step": 175 - }, - { - "epoch": 3.2, - "grad_norm": 5.601878643035889, - "learning_rate": 9.363636363636365e-06, - "loss": 0.0834, - "step": 176 - }, - { - "epoch": 3.2181818181818183, - "grad_norm": 9.302410125732422, - "learning_rate": 9.360000000000002e-06, - "loss": 0.0976, - "step": 177 - }, - { - "epoch": 3.2363636363636363, - "grad_norm": 8.256030082702637, - "learning_rate": 9.356363636363637e-06, - "loss": 0.0825, - "step": 178 - }, - { - "epoch": 3.2545454545454544, - "grad_norm": 2.888793468475342, - "learning_rate": 9.352727272727274e-06, - "loss": 0.0786, - "step": 179 - }, - { - "epoch": 3.2727272727272725, - "grad_norm": 6.941897869110107, - "learning_rate": 9.34909090909091e-06, - "loss": 0.0849, - "step": 180 - }, - { - "epoch": 3.290909090909091, - "grad_norm": 8.527759552001953, - "learning_rate": 9.345454545454547e-06, - "loss": 0.0974, - "step": 181 - }, - { - "epoch": 3.309090909090909, - "grad_norm": 9.929311752319336, - "learning_rate": 9.341818181818184e-06, - "loss": 0.085, - "step": 182 - }, - { - "epoch": 3.327272727272727, - "grad_norm": 5.477362632751465, - "learning_rate": 9.338181818181819e-06, - "loss": 0.079, - "step": 183 - }, - { - "epoch": 3.3454545454545457, - "grad_norm": 3.391834259033203, - "learning_rate": 9.334545454545455e-06, - "loss": 0.0712, - "step": 184 - }, - { - "epoch": 3.3636363636363638, - "grad_norm": 6.883739948272705, - "learning_rate": 9.330909090909092e-06, - "loss": 0.0812, - "step": 185 - }, - { - "epoch": 3.381818181818182, - "grad_norm": 6.06065034866333, - "learning_rate": 9.327272727272729e-06, - "loss": 0.0641, - "step": 186 - }, - { - "epoch": 3.4, - "grad_norm": 0.7123297452926636, - "learning_rate": 9.323636363636366e-06, - "loss": 0.0676, - "step": 187 - }, - { - "epoch": 3.418181818181818, - "grad_norm": 5.412281513214111, - "learning_rate": 9.32e-06, - "loss": 0.0764, - "step": 188 - }, - { - "epoch": 3.4363636363636365, - "grad_norm": 8.968116760253906, - "learning_rate": 9.316363636363637e-06, - "loss": 0.0867, - "step": 189 - }, - { - "epoch": 3.4545454545454546, - "grad_norm": 5.8010454177856445, - "learning_rate": 9.312727272727274e-06, - "loss": 0.072, - "step": 190 - }, - { - "epoch": 3.4727272727272727, - "grad_norm": 0.6209384799003601, - "learning_rate": 9.30909090909091e-06, - "loss": 0.0719, - "step": 191 - }, - { - "epoch": 3.4909090909090907, - "grad_norm": 6.320188045501709, - "learning_rate": 9.305454545454546e-06, - "loss": 0.0818, - "step": 192 - }, - { - "epoch": 3.509090909090909, - "grad_norm": 10.104494094848633, - "learning_rate": 9.301818181818182e-06, - "loss": 0.1017, - "step": 193 - }, - { - "epoch": 3.5272727272727273, - "grad_norm": 7.651689529418945, - "learning_rate": 9.298181818181819e-06, - "loss": 0.0731, - "step": 194 - }, - { - "epoch": 3.5454545454545454, - "grad_norm": 3.0021798610687256, - "learning_rate": 9.294545454545456e-06, - "loss": 0.0622, - "step": 195 - }, - { - "epoch": 3.5636363636363635, - "grad_norm": 4.958535671234131, - "learning_rate": 9.29090909090909e-06, - "loss": 0.0945, - "step": 196 - }, - { - "epoch": 3.581818181818182, - "grad_norm": 9.10540771484375, - "learning_rate": 9.287272727272728e-06, - "loss": 0.0985, - "step": 197 - }, - { - "epoch": 3.6, - "grad_norm": 7.740372180938721, - "learning_rate": 9.283636363636364e-06, - "loss": 0.1007, - "step": 198 - }, - { - "epoch": 3.618181818181818, - "grad_norm": 3.880146026611328, - "learning_rate": 9.280000000000001e-06, - "loss": 0.0903, - "step": 199 - }, - { - "epoch": 3.6363636363636362, - "grad_norm": 4.8426513671875, - "learning_rate": 9.276363636363636e-06, - "loss": 0.0864, - "step": 200 - }, - { - "epoch": 3.6545454545454543, - "grad_norm": 5.517488479614258, - "learning_rate": 9.272727272727273e-06, - "loss": 0.0828, - "step": 201 - }, - { - "epoch": 3.672727272727273, - "grad_norm": 5.299657821655273, - "learning_rate": 9.26909090909091e-06, - "loss": 0.0783, - "step": 202 - }, - { - "epoch": 3.690909090909091, - "grad_norm": 0.6263226866722107, - "learning_rate": 9.265454545454546e-06, - "loss": 0.0785, - "step": 203 - }, - { - "epoch": 3.709090909090909, - "grad_norm": 5.731221675872803, - "learning_rate": 9.261818181818183e-06, - "loss": 0.0767, - "step": 204 - }, - { - "epoch": 3.7272727272727275, - "grad_norm": 6.0733232498168945, - "learning_rate": 9.25818181818182e-06, - "loss": 0.0746, - "step": 205 - }, - { - "epoch": 3.7454545454545456, - "grad_norm": 6.308950424194336, - "learning_rate": 9.254545454545454e-06, - "loss": 0.0725, - "step": 206 - }, - { - "epoch": 3.7636363636363637, - "grad_norm": 0.6078853607177734, - "learning_rate": 9.250909090909091e-06, - "loss": 0.0467, - "step": 207 - }, - { - "epoch": 3.7818181818181817, - "grad_norm": 6.576251029968262, - "learning_rate": 9.247272727272728e-06, - "loss": 0.0695, - "step": 208 - }, - { - "epoch": 3.8, - "grad_norm": 9.8453950881958, - "learning_rate": 9.243636363636365e-06, - "loss": 0.0844, - "step": 209 - }, - { - "epoch": 3.8181818181818183, - "grad_norm": 7.989305019378662, - "learning_rate": 9.240000000000001e-06, - "loss": 0.0898, - "step": 210 - }, - { - "epoch": 3.8363636363636364, - "grad_norm": 2.978806972503662, - "learning_rate": 9.236363636363636e-06, - "loss": 0.0774, - "step": 211 - }, - { - "epoch": 3.8545454545454545, - "grad_norm": 6.208574295043945, - "learning_rate": 9.232727272727273e-06, - "loss": 0.0783, - "step": 212 - }, - { - "epoch": 3.8727272727272726, - "grad_norm": 8.810700416564941, - "learning_rate": 9.22909090909091e-06, - "loss": 0.1001, - "step": 213 - }, - { - "epoch": 3.8909090909090907, - "grad_norm": 8.258453369140625, - "learning_rate": 9.225454545454546e-06, - "loss": 0.0965, - "step": 214 - }, - { - "epoch": 3.909090909090909, - "grad_norm": 6.546050548553467, - "learning_rate": 9.221818181818183e-06, - "loss": 0.0749, - "step": 215 - }, - { - "epoch": 3.9272727272727272, - "grad_norm": 3.9093575477600098, - "learning_rate": 9.21818181818182e-06, - "loss": 0.0829, - "step": 216 - }, - { - "epoch": 3.9454545454545453, - "grad_norm": 6.779893398284912, - "learning_rate": 9.214545454545455e-06, - "loss": 0.0922, - "step": 217 - }, - { - "epoch": 3.963636363636364, - "grad_norm": 5.047027587890625, - "learning_rate": 9.210909090909092e-06, - "loss": 0.0807, - "step": 218 - }, - { - "epoch": 3.981818181818182, - "grad_norm": 1.5959489345550537, - "learning_rate": 9.207272727272728e-06, - "loss": 0.0659, - "step": 219 - }, - { - "epoch": 4.0, - "grad_norm": 6.1624603271484375, - "learning_rate": 9.203636363636365e-06, - "loss": 0.0633, - "step": 220 - }, - { - "epoch": 4.0, - "eval_loss": 0.11175768822431564, - "eval_runtime": 9.3166, - "eval_samples_per_second": 584.443, - "eval_steps_per_second": 73.096, - "step": 220 - }, - { - "epoch": 4.0181818181818185, - "grad_norm": 11.342811584472656, - "learning_rate": 9.200000000000002e-06, - "loss": 0.0999, - "step": 221 - }, - { - "epoch": 4.036363636363636, - "grad_norm": 8.774984359741211, - "learning_rate": 9.196363636363637e-06, - "loss": 0.0985, - "step": 222 - }, - { - "epoch": 4.054545454545455, - "grad_norm": 6.042727470397949, - "learning_rate": 9.192727272727273e-06, - "loss": 0.0784, - "step": 223 - }, - { - "epoch": 4.072727272727272, - "grad_norm": 1.246486783027649, - "learning_rate": 9.18909090909091e-06, - "loss": 0.0782, - "step": 224 - }, - { - "epoch": 4.090909090909091, - "grad_norm": 5.716378211975098, - "learning_rate": 9.185454545454547e-06, - "loss": 0.0684, - "step": 225 - }, - { - "epoch": 4.109090909090909, - "grad_norm": 3.366102457046509, - "learning_rate": 9.181818181818184e-06, - "loss": 0.0696, - "step": 226 - }, - { - "epoch": 4.127272727272727, - "grad_norm": 0.878805935382843, - "learning_rate": 9.17818181818182e-06, - "loss": 0.0679, - "step": 227 - }, - { - "epoch": 4.1454545454545455, - "grad_norm": 7.174416542053223, - "learning_rate": 9.174545454545455e-06, - "loss": 0.0793, - "step": 228 - }, - { - "epoch": 4.163636363636364, - "grad_norm": 10.28535270690918, - "learning_rate": 9.170909090909092e-06, - "loss": 0.0966, - "step": 229 - }, - { - "epoch": 4.181818181818182, - "grad_norm": 9.594989776611328, - "learning_rate": 9.167272727272729e-06, - "loss": 0.0973, - "step": 230 - }, - { - "epoch": 4.2, - "grad_norm": 3.597465991973877, - "learning_rate": 9.163636363636365e-06, - "loss": 0.083, - "step": 231 - }, - { - "epoch": 4.218181818181818, - "grad_norm": 4.103036880493164, - "learning_rate": 9.16e-06, - "loss": 0.0816, - "step": 232 - }, - { - "epoch": 4.236363636363636, - "grad_norm": 8.228384971618652, - "learning_rate": 9.156363636363637e-06, - "loss": 0.094, - "step": 233 - }, - { - "epoch": 4.254545454545455, - "grad_norm": 7.679332256317139, - "learning_rate": 9.152727272727274e-06, - "loss": 0.0877, - "step": 234 - }, - { - "epoch": 4.2727272727272725, - "grad_norm": 2.864004611968994, - "learning_rate": 9.14909090909091e-06, - "loss": 0.0801, - "step": 235 - }, - { - "epoch": 4.290909090909091, - "grad_norm": 4.906161308288574, - "learning_rate": 9.145454545454546e-06, - "loss": 0.0846, - "step": 236 - }, - { - "epoch": 4.309090909090909, - "grad_norm": 6.705206871032715, - "learning_rate": 9.141818181818182e-06, - "loss": 0.0808, - "step": 237 - }, - { - "epoch": 4.327272727272727, - "grad_norm": 6.570376396179199, - "learning_rate": 9.138181818181819e-06, - "loss": 0.0734, - "step": 238 - }, - { - "epoch": 4.345454545454546, - "grad_norm": 2.2556116580963135, - "learning_rate": 9.134545454545456e-06, - "loss": 0.071, - "step": 239 - }, - { - "epoch": 4.363636363636363, - "grad_norm": 6.6670379638671875, - "learning_rate": 9.13090909090909e-06, - "loss": 0.0782, - "step": 240 - }, - { - "epoch": 4.381818181818182, - "grad_norm": 8.097082138061523, - "learning_rate": 9.127272727272727e-06, - "loss": 0.0654, - "step": 241 - }, - { - "epoch": 4.4, - "grad_norm": 8.15650463104248, - "learning_rate": 9.123636363636364e-06, - "loss": 0.0849, - "step": 242 - }, - { - "epoch": 4.418181818181818, - "grad_norm": 4.7494425773620605, - "learning_rate": 9.12e-06, - "loss": 0.0875, - "step": 243 - }, - { - "epoch": 4.4363636363636365, - "grad_norm": 1.80208420753479, - "learning_rate": 9.116363636363637e-06, - "loss": 0.0678, - "step": 244 - }, - { - "epoch": 4.454545454545454, - "grad_norm": 5.914602279663086, - "learning_rate": 9.112727272727272e-06, - "loss": 0.083, - "step": 245 - }, - { - "epoch": 4.472727272727273, - "grad_norm": 4.887852668762207, - "learning_rate": 9.10909090909091e-06, - "loss": 0.0702, - "step": 246 - }, - { - "epoch": 4.490909090909091, - "grad_norm": 0.9011878371238708, - "learning_rate": 9.105454545454546e-06, - "loss": 0.0512, - "step": 247 - }, - { - "epoch": 4.509090909090909, - "grad_norm": 5.650793552398682, - "learning_rate": 9.101818181818183e-06, - "loss": 0.0597, - "step": 248 - }, - { - "epoch": 4.527272727272727, - "grad_norm": 8.356748580932617, - "learning_rate": 9.09818181818182e-06, - "loss": 0.0957, - "step": 249 - }, - { - "epoch": 4.545454545454545, - "grad_norm": 6.626115798950195, - "learning_rate": 9.094545454545454e-06, - "loss": 0.0831, - "step": 250 - }, - { - "epoch": 4.5636363636363635, - "grad_norm": 1.9693832397460938, - "learning_rate": 9.090909090909091e-06, - "loss": 0.0569, - "step": 251 - }, - { - "epoch": 4.581818181818182, - "grad_norm": 4.674869537353516, - "learning_rate": 9.087272727272728e-06, - "loss": 0.0613, - "step": 252 - }, - { - "epoch": 4.6, - "grad_norm": 9.009145736694336, - "learning_rate": 9.083636363636364e-06, - "loss": 0.0892, - "step": 253 - }, - { - "epoch": 4.618181818181818, - "grad_norm": 7.072900772094727, - "learning_rate": 9.080000000000001e-06, - "loss": 0.0861, - "step": 254 - }, - { - "epoch": 4.636363636363637, - "grad_norm": 4.235187530517578, - "learning_rate": 9.076363636363638e-06, - "loss": 0.0636, - "step": 255 - }, - { - "epoch": 4.654545454545454, - "grad_norm": 3.3887217044830322, - "learning_rate": 9.072727272727273e-06, - "loss": 0.0737, - "step": 256 - }, - { - "epoch": 4.672727272727273, - "grad_norm": 5.327084541320801, - "learning_rate": 9.06909090909091e-06, - "loss": 0.0672, - "step": 257 - }, - { - "epoch": 4.690909090909091, - "grad_norm": 4.684718132019043, - "learning_rate": 9.065454545454546e-06, - "loss": 0.0668, - "step": 258 - }, - { - "epoch": 4.709090909090909, - "grad_norm": 0.5791177749633789, - "learning_rate": 9.061818181818183e-06, - "loss": 0.0492, - "step": 259 - }, - { - "epoch": 4.7272727272727275, - "grad_norm": 4.028196334838867, - "learning_rate": 9.05818181818182e-06, - "loss": 0.0799, - "step": 260 - }, - { - "epoch": 4.745454545454545, - "grad_norm": 4.580573558807373, - "learning_rate": 9.054545454545455e-06, - "loss": 0.0673, - "step": 261 - }, - { - "epoch": 4.763636363636364, - "grad_norm": 4.59187650680542, - "learning_rate": 9.050909090909091e-06, - "loss": 0.0662, - "step": 262 - }, - { - "epoch": 4.781818181818182, - "grad_norm": 1.2157129049301147, - "learning_rate": 9.047272727272728e-06, - "loss": 0.0675, - "step": 263 - }, - { - "epoch": 4.8, - "grad_norm": 1.7148888111114502, - "learning_rate": 9.043636363636365e-06, - "loss": 0.0754, - "step": 264 - }, - { - "epoch": 4.818181818181818, - "grad_norm": 1.0630745887756348, - "learning_rate": 9.040000000000002e-06, - "loss": 0.0717, - "step": 265 - }, - { - "epoch": 4.836363636363636, - "grad_norm": 3.748553514480591, - "learning_rate": 9.036363636363638e-06, - "loss": 0.0655, - "step": 266 - }, - { - "epoch": 4.8545454545454545, - "grad_norm": 5.848458766937256, - "learning_rate": 9.032727272727273e-06, - "loss": 0.0851, - "step": 267 - }, - { - "epoch": 4.872727272727273, - "grad_norm": 2.563995122909546, - "learning_rate": 9.02909090909091e-06, - "loss": 0.0667, - "step": 268 - }, - { - "epoch": 4.890909090909091, - "grad_norm": 0.999470591545105, - "learning_rate": 9.025454545454547e-06, - "loss": 0.0853, - "step": 269 - }, - { - "epoch": 4.909090909090909, - "grad_norm": 8.2174654006958, - "learning_rate": 9.021818181818183e-06, - "loss": 0.0854, - "step": 270 - }, - { - "epoch": 4.927272727272728, - "grad_norm": 11.609013557434082, - "learning_rate": 9.01818181818182e-06, - "loss": 0.1122, - "step": 271 - }, - { - "epoch": 4.945454545454545, - "grad_norm": 11.43703556060791, - "learning_rate": 9.014545454545455e-06, - "loss": 0.0968, - "step": 272 - }, - { - "epoch": 4.963636363636364, - "grad_norm": 7.838481903076172, - "learning_rate": 9.010909090909092e-06, - "loss": 0.0922, - "step": 273 - }, - { - "epoch": 4.9818181818181815, - "grad_norm": 1.7378261089324951, - "learning_rate": 9.007272727272729e-06, - "loss": 0.0623, - "step": 274 - }, - { - "epoch": 5.0, - "grad_norm": 5.073530197143555, - "learning_rate": 9.003636363636365e-06, - "loss": 0.0898, - "step": 275 - }, - { - "epoch": 5.0, - "eval_loss": 0.11605573445558548, - "eval_runtime": 9.35, - "eval_samples_per_second": 582.352, - "eval_steps_per_second": 72.834, - "step": 275 - }, - { - "epoch": 5.0181818181818185, - "grad_norm": 12.678677558898926, - "learning_rate": 9e-06, - "loss": 0.1081, - "step": 276 - }, - { - "epoch": 5.036363636363636, - "grad_norm": 11.12718677520752, - "learning_rate": 8.996363636363637e-06, - "loss": 0.1167, - "step": 277 - }, - { - "epoch": 5.054545454545455, - "grad_norm": 12.390442848205566, - "learning_rate": 8.992727272727274e-06, - "loss": 0.1202, - "step": 278 - }, - { - "epoch": 5.072727272727272, - "grad_norm": 5.101798057556152, - "learning_rate": 8.98909090909091e-06, - "loss": 0.0671, - "step": 279 - }, - { - "epoch": 5.090909090909091, - "grad_norm": 2.9029312133789062, - "learning_rate": 8.985454545454545e-06, - "loss": 0.0543, - "step": 280 - }, - { - "epoch": 5.109090909090909, - "grad_norm": 8.147836685180664, - "learning_rate": 8.981818181818182e-06, - "loss": 0.0906, - "step": 281 - }, - { - "epoch": 5.127272727272727, - "grad_norm": 7.752715110778809, - "learning_rate": 8.978181818181819e-06, - "loss": 0.0903, - "step": 282 - }, - { - "epoch": 5.1454545454545455, - "grad_norm": 4.003704071044922, - "learning_rate": 8.974545454545455e-06, - "loss": 0.076, - "step": 283 - }, - { - "epoch": 5.163636363636364, - "grad_norm": 0.46452245116233826, - "learning_rate": 8.97090909090909e-06, - "loss": 0.0666, - "step": 284 - }, - { - "epoch": 5.181818181818182, - "grad_norm": 5.349115371704102, - "learning_rate": 8.967272727272727e-06, - "loss": 0.0685, - "step": 285 - }, - { - "epoch": 5.2, - "grad_norm": 7.328717231750488, - "learning_rate": 8.963636363636364e-06, - "loss": 0.0752, - "step": 286 - }, - { - "epoch": 5.218181818181818, - "grad_norm": 4.938404083251953, - "learning_rate": 8.96e-06, - "loss": 0.0713, - "step": 287 - }, - { - "epoch": 5.236363636363636, - "grad_norm": 1.616647720336914, - "learning_rate": 8.956363636363637e-06, - "loss": 0.0799, - "step": 288 - }, - { - "epoch": 5.254545454545455, - "grad_norm": 4.676848888397217, - "learning_rate": 8.952727272727272e-06, - "loss": 0.0906, - "step": 289 - }, - { - "epoch": 5.2727272727272725, - "grad_norm": 1.2149401903152466, - "learning_rate": 8.949090909090909e-06, - "loss": 0.0745, - "step": 290 - }, - { - "epoch": 5.290909090909091, - "grad_norm": 2.674747943878174, - "learning_rate": 8.945454545454546e-06, - "loss": 0.0601, - "step": 291 - }, - { - "epoch": 5.309090909090909, - "grad_norm": 2.8979029655456543, - "learning_rate": 8.941818181818182e-06, - "loss": 0.0835, - "step": 292 - }, - { - "epoch": 5.327272727272727, - "grad_norm": 1.1090362071990967, - "learning_rate": 8.93818181818182e-06, - "loss": 0.0626, - "step": 293 - }, - { - "epoch": 5.345454545454546, - "grad_norm": 5.165194988250732, - "learning_rate": 8.934545454545456e-06, - "loss": 0.0797, - "step": 294 - }, - { - "epoch": 5.363636363636363, - "grad_norm": 5.731832504272461, - "learning_rate": 8.930909090909091e-06, - "loss": 0.0678, - "step": 295 - }, - { - "epoch": 5.381818181818182, - "grad_norm": 5.0275115966796875, - "learning_rate": 8.927272727272728e-06, - "loss": 0.0683, - "step": 296 - }, - { - "epoch": 5.4, - "grad_norm": 0.6816781759262085, - "learning_rate": 8.923636363636364e-06, - "loss": 0.0812, - "step": 297 - }, - { - "epoch": 5.418181818181818, - "grad_norm": 2.9658639430999756, - "learning_rate": 8.920000000000001e-06, - "loss": 0.0753, - "step": 298 - }, - { - "epoch": 5.4363636363636365, - "grad_norm": 3.4258341789245605, - "learning_rate": 8.916363636363638e-06, - "loss": 0.0532, - "step": 299 - }, - { - "epoch": 5.454545454545454, - "grad_norm": 1.3165113925933838, - "learning_rate": 8.912727272727274e-06, - "loss": 0.0606, - "step": 300 - }, - { - "epoch": 5.472727272727273, - "grad_norm": 3.113178253173828, - "learning_rate": 8.90909090909091e-06, - "loss": 0.0607, - "step": 301 - }, - { - "epoch": 5.490909090909091, - "grad_norm": 0.8872824907302856, - "learning_rate": 8.905454545454546e-06, - "loss": 0.0597, - "step": 302 - }, - { - "epoch": 5.509090909090909, - "grad_norm": 0.9222560524940491, - "learning_rate": 8.901818181818183e-06, - "loss": 0.0626, - "step": 303 - }, - { - "epoch": 5.527272727272727, - "grad_norm": 3.6576836109161377, - "learning_rate": 8.89818181818182e-06, - "loss": 0.0748, - "step": 304 - }, - { - "epoch": 5.545454545454545, - "grad_norm": 1.5232235193252563, - "learning_rate": 8.894545454545456e-06, - "loss": 0.0545, - "step": 305 - }, - { - "epoch": 5.5636363636363635, - "grad_norm": 0.4403159022331238, - "learning_rate": 8.890909090909091e-06, - "loss": 0.0754, - "step": 306 - }, - { - "epoch": 5.581818181818182, - "grad_norm": 4.333779335021973, - "learning_rate": 8.887272727272728e-06, - "loss": 0.0755, - "step": 307 - }, - { - "epoch": 5.6, - "grad_norm": 4.799773216247559, - "learning_rate": 8.883636363636365e-06, - "loss": 0.0626, - "step": 308 - }, - { - "epoch": 5.618181818181818, - "grad_norm": 0.9533578157424927, - "learning_rate": 8.880000000000001e-06, - "loss": 0.0607, - "step": 309 - }, - { - "epoch": 5.636363636363637, - "grad_norm": 1.9961360692977905, - "learning_rate": 8.876363636363638e-06, - "loss": 0.0625, - "step": 310 - }, - { - "epoch": 5.654545454545454, - "grad_norm": 3.6371355056762695, - "learning_rate": 8.872727272727275e-06, - "loss": 0.0729, - "step": 311 - }, - { - "epoch": 5.672727272727273, - "grad_norm": 1.9729257822036743, - "learning_rate": 8.86909090909091e-06, - "loss": 0.0755, - "step": 312 - }, - { - "epoch": 5.690909090909091, - "grad_norm": 0.9291281700134277, - "learning_rate": 8.865454545454547e-06, - "loss": 0.0596, - "step": 313 - }, - { - "epoch": 5.709090909090909, - "grad_norm": 3.7816944122314453, - "learning_rate": 8.861818181818183e-06, - "loss": 0.0718, - "step": 314 - }, - { - "epoch": 5.7272727272727275, - "grad_norm": 1.5624970197677612, - "learning_rate": 8.85818181818182e-06, - "loss": 0.0779, - "step": 315 - }, - { - "epoch": 5.745454545454545, - "grad_norm": 3.4021527767181396, - "learning_rate": 8.854545454545455e-06, - "loss": 0.0633, - "step": 316 - }, - { - "epoch": 5.763636363636364, - "grad_norm": 5.3682355880737305, - "learning_rate": 8.850909090909092e-06, - "loss": 0.0713, - "step": 317 - }, - { - "epoch": 5.781818181818182, - "grad_norm": 1.8995529413223267, - "learning_rate": 8.847272727272728e-06, - "loss": 0.0645, - "step": 318 - }, - { - "epoch": 5.8, - "grad_norm": 2.4608638286590576, - "learning_rate": 8.843636363636365e-06, - "loss": 0.0616, - "step": 319 - }, - { - "epoch": 5.818181818181818, - "grad_norm": 3.6318161487579346, - "learning_rate": 8.84e-06, - "loss": 0.0646, - "step": 320 - }, - { - "epoch": 5.836363636363636, - "grad_norm": 1.7944884300231934, - "learning_rate": 8.836363636363637e-06, - "loss": 0.0724, - "step": 321 - }, - { - "epoch": 5.8545454545454545, - "grad_norm": 2.7613065242767334, - "learning_rate": 8.832727272727273e-06, - "loss": 0.0565, - "step": 322 - }, - { - "epoch": 5.872727272727273, - "grad_norm": 5.100012302398682, - "learning_rate": 8.82909090909091e-06, - "loss": 0.0714, - "step": 323 - }, - { - "epoch": 5.890909090909091, - "grad_norm": 3.113781213760376, - "learning_rate": 8.825454545454545e-06, - "loss": 0.0783, - "step": 324 - }, - { - "epoch": 5.909090909090909, - "grad_norm": 1.5907810926437378, - "learning_rate": 8.821818181818182e-06, - "loss": 0.0689, - "step": 325 - }, - { - "epoch": 5.927272727272728, - "grad_norm": 3.4750540256500244, - "learning_rate": 8.818181818181819e-06, - "loss": 0.0796, - "step": 326 - }, - { - "epoch": 5.945454545454545, - "grad_norm": 2.405463457107544, - "learning_rate": 8.814545454545455e-06, - "loss": 0.0513, - "step": 327 - }, - { - "epoch": 5.963636363636364, - "grad_norm": 3.3527348041534424, - "learning_rate": 8.81090909090909e-06, - "loss": 0.0692, - "step": 328 - }, - { - "epoch": 5.9818181818181815, - "grad_norm": 4.547636032104492, - "learning_rate": 8.807272727272727e-06, - "loss": 0.0738, - "step": 329 - }, - { - "epoch": 6.0, - "grad_norm": 1.594735026359558, - "learning_rate": 8.803636363636364e-06, - "loss": 0.059, - "step": 330 - }, - { - "epoch": 6.0, - "eval_loss": 0.06676711142063141, - "eval_runtime": 8.8819, - "eval_samples_per_second": 613.045, - "eval_steps_per_second": 76.673, - "step": 330 - }, - { - "epoch": 6.0181818181818185, - "grad_norm": 3.014591932296753, - "learning_rate": 8.8e-06, - "loss": 0.063, - "step": 331 - }, - { - "epoch": 6.036363636363636, - "grad_norm": 2.3795886039733887, - "learning_rate": 8.796363636363637e-06, - "loss": 0.069, - "step": 332 - }, - { - "epoch": 6.054545454545455, - "grad_norm": 3.4229989051818848, - "learning_rate": 8.792727272727274e-06, - "loss": 0.0644, - "step": 333 - }, - { - "epoch": 6.072727272727272, - "grad_norm": 3.943129301071167, - "learning_rate": 8.789090909090909e-06, - "loss": 0.0709, - "step": 334 - }, - { - "epoch": 6.090909090909091, - "grad_norm": 3.517542600631714, - "learning_rate": 8.785454545454546e-06, - "loss": 0.0645, - "step": 335 - }, - { - "epoch": 6.109090909090909, - "grad_norm": 3.136274576187134, - "learning_rate": 8.781818181818182e-06, - "loss": 0.0669, - "step": 336 - }, - { - "epoch": 6.127272727272727, - "grad_norm": 1.5289356708526611, - "learning_rate": 8.778181818181819e-06, - "loss": 0.0737, - "step": 337 - }, - { - "epoch": 6.1454545454545455, - "grad_norm": 4.112996578216553, - "learning_rate": 8.774545454545456e-06, - "loss": 0.0676, - "step": 338 - }, - { - "epoch": 6.163636363636364, - "grad_norm": 2.3339052200317383, - "learning_rate": 8.770909090909092e-06, - "loss": 0.0666, - "step": 339 - }, - { - "epoch": 6.181818181818182, - "grad_norm": 2.275592565536499, - "learning_rate": 8.767272727272727e-06, - "loss": 0.0691, - "step": 340 - }, - { - "epoch": 6.2, - "grad_norm": 4.114313125610352, - "learning_rate": 8.763636363636364e-06, - "loss": 0.0844, - "step": 341 - }, - { - "epoch": 6.218181818181818, - "grad_norm": 2.0702884197235107, - "learning_rate": 8.76e-06, - "loss": 0.0722, - "step": 342 - }, - { - "epoch": 6.236363636363636, - "grad_norm": 2.718057632446289, - "learning_rate": 8.756363636363638e-06, - "loss": 0.0601, - "step": 343 - }, - { - "epoch": 6.254545454545455, - "grad_norm": 4.090023040771484, - "learning_rate": 8.752727272727274e-06, - "loss": 0.0758, - "step": 344 - }, - { - "epoch": 6.2727272727272725, - "grad_norm": 2.067643165588379, - "learning_rate": 8.74909090909091e-06, - "loss": 0.0713, - "step": 345 - }, - { - "epoch": 6.290909090909091, - "grad_norm": 2.2044239044189453, - "learning_rate": 8.745454545454546e-06, - "loss": 0.0626, - "step": 346 - }, - { - "epoch": 6.309090909090909, - "grad_norm": 3.289881944656372, - "learning_rate": 8.741818181818183e-06, - "loss": 0.0697, - "step": 347 - }, - { - "epoch": 6.327272727272727, - "grad_norm": 2.8440468311309814, - "learning_rate": 8.73818181818182e-06, - "loss": 0.0823, - "step": 348 - }, - { - "epoch": 6.345454545454546, - "grad_norm": 4.616799354553223, - "learning_rate": 8.734545454545456e-06, - "loss": 0.0685, - "step": 349 - }, - { - "epoch": 6.363636363636363, - "grad_norm": 4.61152982711792, - "learning_rate": 8.730909090909093e-06, - "loss": 0.0633, - "step": 350 - }, - { - "epoch": 6.381818181818182, - "grad_norm": 0.9026534557342529, - "learning_rate": 8.727272727272728e-06, - "loss": 0.0599, - "step": 351 - }, - { - "epoch": 6.4, - "grad_norm": 2.0948355197906494, - "learning_rate": 8.723636363636365e-06, - "loss": 0.0744, - "step": 352 - }, - { - "epoch": 6.418181818181818, - "grad_norm": 3.6390275955200195, - "learning_rate": 8.720000000000001e-06, - "loss": 0.0734, - "step": 353 - }, - { - "epoch": 6.4363636363636365, - "grad_norm": 0.5999282002449036, - "learning_rate": 8.716363636363638e-06, - "loss": 0.0558, - "step": 354 - }, - { - "epoch": 6.454545454545454, - "grad_norm": 2.62451434135437, - "learning_rate": 8.712727272727275e-06, - "loss": 0.0672, - "step": 355 - }, - { - "epoch": 6.472727272727273, - "grad_norm": 2.598998785018921, - "learning_rate": 8.70909090909091e-06, - "loss": 0.054, - "step": 356 - }, - { - "epoch": 6.490909090909091, - "grad_norm": 0.22191770374774933, - "learning_rate": 8.705454545454546e-06, - "loss": 0.069, - "step": 357 - }, - { - "epoch": 6.509090909090909, - "grad_norm": 1.8839478492736816, - "learning_rate": 8.701818181818183e-06, - "loss": 0.066, - "step": 358 - }, - { - "epoch": 6.527272727272727, - "grad_norm": 1.0882648229599, - "learning_rate": 8.69818181818182e-06, - "loss": 0.051, - "step": 359 - }, - { - "epoch": 6.545454545454545, - "grad_norm": 3.1796767711639404, - "learning_rate": 8.694545454545455e-06, - "loss": 0.0577, - "step": 360 - }, - { - "epoch": 6.5636363636363635, - "grad_norm": 4.426327228546143, - "learning_rate": 8.690909090909091e-06, - "loss": 0.056, - "step": 361 - }, - { - "epoch": 6.581818181818182, - "grad_norm": 0.34447482228279114, - "learning_rate": 8.687272727272728e-06, - "loss": 0.0617, - "step": 362 - }, - { - "epoch": 6.6, - "grad_norm": 1.6324418783187866, - "learning_rate": 8.683636363636365e-06, - "loss": 0.0692, - "step": 363 - }, - { - "epoch": 6.618181818181818, - "grad_norm": 3.390110492706299, - "learning_rate": 8.68e-06, - "loss": 0.08, - "step": 364 - }, - { - "epoch": 6.636363636363637, - "grad_norm": 1.7087422609329224, - "learning_rate": 8.676363636363637e-06, - "loss": 0.0626, - "step": 365 - }, - { - "epoch": 6.654545454545454, - "grad_norm": 1.5722382068634033, - "learning_rate": 8.672727272727273e-06, - "loss": 0.0666, - "step": 366 - }, - { - "epoch": 6.672727272727273, - "grad_norm": 3.5599184036254883, - "learning_rate": 8.66909090909091e-06, - "loss": 0.0742, - "step": 367 - }, - { - "epoch": 6.690909090909091, - "grad_norm": 2.8197669982910156, - "learning_rate": 8.665454545454545e-06, - "loss": 0.0697, - "step": 368 - }, - { - "epoch": 6.709090909090909, - "grad_norm": 0.20425274968147278, - "learning_rate": 8.661818181818182e-06, - "loss": 0.0595, - "step": 369 - }, - { - "epoch": 6.7272727272727275, - "grad_norm": 3.737602949142456, - "learning_rate": 8.658181818181818e-06, - "loss": 0.0716, - "step": 370 - }, - { - "epoch": 6.745454545454545, - "grad_norm": 1.88663649559021, - "learning_rate": 8.654545454545455e-06, - "loss": 0.051, - "step": 371 - }, - { - "epoch": 6.763636363636364, - "grad_norm": 1.9899128675460815, - "learning_rate": 8.650909090909092e-06, - "loss": 0.0724, - "step": 372 - }, - { - "epoch": 6.781818181818182, - "grad_norm": 0.7804005146026611, - "learning_rate": 8.647272727272727e-06, - "loss": 0.0603, - "step": 373 - }, - { - "epoch": 6.8, - "grad_norm": 2.6115078926086426, - "learning_rate": 8.643636363636364e-06, - "loss": 0.0861, - "step": 374 - }, - { - "epoch": 6.818181818181818, - "grad_norm": 3.1797430515289307, - "learning_rate": 8.64e-06, - "loss": 0.0693, - "step": 375 - }, - { - "epoch": 6.836363636363636, - "grad_norm": 4.309091091156006, - "learning_rate": 8.636363636363637e-06, - "loss": 0.0629, - "step": 376 - }, - { - "epoch": 6.8545454545454545, - "grad_norm": 1.4255762100219727, - "learning_rate": 8.632727272727274e-06, - "loss": 0.0784, - "step": 377 - }, - { - "epoch": 6.872727272727273, - "grad_norm": 4.26143217086792, - "learning_rate": 8.62909090909091e-06, - "loss": 0.0706, - "step": 378 - }, - { - "epoch": 6.890909090909091, - "grad_norm": 5.511367321014404, - "learning_rate": 8.625454545454545e-06, - "loss": 0.0703, - "step": 379 - }, - { - "epoch": 6.909090909090909, - "grad_norm": 2.9363088607788086, - "learning_rate": 8.621818181818182e-06, - "loss": 0.0599, - "step": 380 - }, - { - "epoch": 6.927272727272728, - "grad_norm": 1.363046646118164, - "learning_rate": 8.618181818181819e-06, - "loss": 0.0613, - "step": 381 - }, - { - "epoch": 6.945454545454545, - "grad_norm": 3.4964208602905273, - "learning_rate": 8.614545454545456e-06, - "loss": 0.073, - "step": 382 - }, - { - "epoch": 6.963636363636364, - "grad_norm": 0.7610279321670532, - "learning_rate": 8.610909090909092e-06, - "loss": 0.066, - "step": 383 - }, - { - "epoch": 6.9818181818181815, - "grad_norm": 2.2525970935821533, - "learning_rate": 8.607272727272727e-06, - "loss": 0.0525, - "step": 384 - }, - { - "epoch": 7.0, - "grad_norm": 3.7395989894866943, - "learning_rate": 8.603636363636364e-06, - "loss": 0.0708, - "step": 385 - }, - { - "epoch": 7.0, - "eval_loss": 0.062219034880399704, - "eval_runtime": 9.2677, - "eval_samples_per_second": 587.526, - "eval_steps_per_second": 73.481, - "step": 385 - }, - { - "epoch": 7.0181818181818185, - "grad_norm": 4.2018256187438965, - "learning_rate": 8.6e-06, - "loss": 0.0857, - "step": 386 - }, - { - "epoch": 7.036363636363636, - "grad_norm": 3.8699254989624023, - "learning_rate": 8.596363636363637e-06, - "loss": 0.0558, - "step": 387 - }, - { - "epoch": 7.054545454545455, - "grad_norm": 4.983640193939209, - "learning_rate": 8.592727272727274e-06, - "loss": 0.0681, - "step": 388 - }, - { - "epoch": 7.072727272727272, - "grad_norm": 3.9834702014923096, - "learning_rate": 8.58909090909091e-06, - "loss": 0.0673, - "step": 389 - }, - { - "epoch": 7.090909090909091, - "grad_norm": 1.202775478363037, - "learning_rate": 8.585454545454546e-06, - "loss": 0.0497, - "step": 390 - }, - { - "epoch": 7.109090909090909, - "grad_norm": 1.8112752437591553, - "learning_rate": 8.581818181818183e-06, - "loss": 0.0724, - "step": 391 - }, - { - "epoch": 7.127272727272727, - "grad_norm": 0.5086401700973511, - "learning_rate": 8.57818181818182e-06, - "loss": 0.0727, - "step": 392 - }, - { - "epoch": 7.1454545454545455, - "grad_norm": 1.5880770683288574, - "learning_rate": 8.574545454545456e-06, - "loss": 0.0581, - "step": 393 - }, - { - "epoch": 7.163636363636364, - "grad_norm": 2.3417906761169434, - "learning_rate": 8.570909090909093e-06, - "loss": 0.0619, - "step": 394 - }, - { - "epoch": 7.181818181818182, - "grad_norm": 1.8647217750549316, - "learning_rate": 8.56727272727273e-06, - "loss": 0.074, - "step": 395 - }, - { - "epoch": 7.2, - "grad_norm": 2.618100166320801, - "learning_rate": 8.563636363636364e-06, - "loss": 0.0675, - "step": 396 - }, - { - "epoch": 7.218181818181818, - "grad_norm": 2.536832809448242, - "learning_rate": 8.560000000000001e-06, - "loss": 0.0757, - "step": 397 - }, - { - "epoch": 7.236363636363636, - "grad_norm": 0.6125643253326416, - "learning_rate": 8.556363636363638e-06, - "loss": 0.058, - "step": 398 - }, - { - "epoch": 7.254545454545455, - "grad_norm": 1.9281374216079712, - "learning_rate": 8.552727272727274e-06, - "loss": 0.0755, - "step": 399 - }, - { - "epoch": 7.2727272727272725, - "grad_norm": 0.7653562426567078, - "learning_rate": 8.54909090909091e-06, - "loss": 0.0593, - "step": 400 - }, - { - "epoch": 7.290909090909091, - "grad_norm": 1.9463859796524048, - "learning_rate": 8.545454545454546e-06, - "loss": 0.0558, - "step": 401 - }, - { - "epoch": 7.309090909090909, - "grad_norm": 2.494232654571533, - "learning_rate": 8.541818181818183e-06, - "loss": 0.0609, - "step": 402 - }, - { - "epoch": 7.327272727272727, - "grad_norm": 2.4146039485931396, - "learning_rate": 8.53818181818182e-06, - "loss": 0.0595, - "step": 403 - }, - { - "epoch": 7.345454545454546, - "grad_norm": 0.3921318054199219, - "learning_rate": 8.534545454545455e-06, - "loss": 0.0695, - "step": 404 - }, - { - "epoch": 7.363636363636363, - "grad_norm": 2.6273839473724365, - "learning_rate": 8.530909090909091e-06, - "loss": 0.0807, - "step": 405 - }, - { - "epoch": 7.381818181818182, - "grad_norm": 2.284071922302246, - "learning_rate": 8.527272727272728e-06, - "loss": 0.0564, - "step": 406 - }, - { - "epoch": 7.4, - "grad_norm": 1.8461284637451172, - "learning_rate": 8.523636363636365e-06, - "loss": 0.0473, - "step": 407 - }, - { - "epoch": 7.418181818181818, - "grad_norm": 2.694505453109741, - "learning_rate": 8.52e-06, - "loss": 0.0535, - "step": 408 - }, - { - "epoch": 7.4363636363636365, - "grad_norm": 1.748871088027954, - "learning_rate": 8.516363636363636e-06, - "loss": 0.0574, - "step": 409 - }, - { - "epoch": 7.454545454545454, - "grad_norm": 2.933605194091797, - "learning_rate": 8.512727272727273e-06, - "loss": 0.0675, - "step": 410 - }, - { - "epoch": 7.472727272727273, - "grad_norm": 4.155369758605957, - "learning_rate": 8.50909090909091e-06, - "loss": 0.0537, - "step": 411 - }, - { - "epoch": 7.490909090909091, - "grad_norm": 3.2797601222991943, - "learning_rate": 8.505454545454545e-06, - "loss": 0.062, - "step": 412 - }, - { - "epoch": 7.509090909090909, - "grad_norm": 0.8535061478614807, - "learning_rate": 8.501818181818182e-06, - "loss": 0.0533, - "step": 413 - }, - { - "epoch": 7.527272727272727, - "grad_norm": 2.5627615451812744, - "learning_rate": 8.498181818181818e-06, - "loss": 0.044, - "step": 414 - }, - { - "epoch": 7.545454545454545, - "grad_norm": 2.2128312587738037, - "learning_rate": 8.494545454545455e-06, - "loss": 0.0758, - "step": 415 - }, - { - "epoch": 7.5636363636363635, - "grad_norm": 1.6620253324508667, - "learning_rate": 8.490909090909092e-06, - "loss": 0.0662, - "step": 416 - }, - { - "epoch": 7.581818181818182, - "grad_norm": 2.95094895362854, - "learning_rate": 8.487272727272728e-06, - "loss": 0.0742, - "step": 417 - }, - { - "epoch": 7.6, - "grad_norm": 1.5823708772659302, - "learning_rate": 8.483636363636363e-06, - "loss": 0.0549, - "step": 418 - }, - { - "epoch": 7.618181818181818, - "grad_norm": 3.4548749923706055, - "learning_rate": 8.48e-06, - "loss": 0.0723, - "step": 419 - }, - { - "epoch": 7.636363636363637, - "grad_norm": 3.996311902999878, - "learning_rate": 8.476363636363637e-06, - "loss": 0.0715, - "step": 420 - }, - { - "epoch": 7.654545454545454, - "grad_norm": 2.73309063911438, - "learning_rate": 8.472727272727274e-06, - "loss": 0.0701, - "step": 421 - }, - { - "epoch": 7.672727272727273, - "grad_norm": 2.0375096797943115, - "learning_rate": 8.46909090909091e-06, - "loss": 0.0703, - "step": 422 - }, - { - "epoch": 7.690909090909091, - "grad_norm": 2.531511068344116, - "learning_rate": 8.465454545454547e-06, - "loss": 0.0601, - "step": 423 - }, - { - "epoch": 7.709090909090909, - "grad_norm": 0.21027733385562897, - "learning_rate": 8.461818181818182e-06, - "loss": 0.0652, - "step": 424 - }, - { - "epoch": 7.7272727272727275, - "grad_norm": 1.5353460311889648, - "learning_rate": 8.458181818181819e-06, - "loss": 0.0632, - "step": 425 - }, - { - "epoch": 7.745454545454545, - "grad_norm": 1.0163030624389648, - "learning_rate": 8.454545454545455e-06, - "loss": 0.073, - "step": 426 - }, - { - "epoch": 7.763636363636364, - "grad_norm": 0.7413188219070435, - "learning_rate": 8.450909090909092e-06, - "loss": 0.0663, - "step": 427 - }, - { - "epoch": 7.781818181818182, - "grad_norm": 0.931667149066925, - "learning_rate": 8.447272727272729e-06, - "loss": 0.0567, - "step": 428 - }, - { - "epoch": 7.8, - "grad_norm": 2.4197258949279785, - "learning_rate": 8.443636363636364e-06, - "loss": 0.0635, - "step": 429 - }, - { - "epoch": 7.818181818181818, - "grad_norm": 2.299356698989868, - "learning_rate": 8.44e-06, - "loss": 0.0753, - "step": 430 - }, - { - "epoch": 7.836363636363636, - "grad_norm": 1.914243221282959, - "learning_rate": 8.436363636363637e-06, - "loss": 0.0627, - "step": 431 - }, - { - "epoch": 7.8545454545454545, - "grad_norm": 2.0206756591796875, - "learning_rate": 8.432727272727274e-06, - "loss": 0.0758, - "step": 432 - }, - { - "epoch": 7.872727272727273, - "grad_norm": 2.1410157680511475, - "learning_rate": 8.42909090909091e-06, - "loss": 0.0729, - "step": 433 - }, - { - "epoch": 7.890909090909091, - "grad_norm": 1.866721272468567, - "learning_rate": 8.425454545454547e-06, - "loss": 0.0575, - "step": 434 - }, - { - "epoch": 7.909090909090909, - "grad_norm": 1.4705517292022705, - "learning_rate": 8.421818181818182e-06, - "loss": 0.0645, - "step": 435 - }, - { - "epoch": 7.927272727272728, - "grad_norm": 0.24675053358078003, - "learning_rate": 8.418181818181819e-06, - "loss": 0.0725, - "step": 436 - }, - { - "epoch": 7.945454545454545, - "grad_norm": 0.586995542049408, - "learning_rate": 8.414545454545456e-06, - "loss": 0.0717, - "step": 437 - }, - { - "epoch": 7.963636363636364, - "grad_norm": 2.399186372756958, - "learning_rate": 8.410909090909092e-06, - "loss": 0.0884, - "step": 438 - }, - { - "epoch": 7.9818181818181815, - "grad_norm": 0.9824120998382568, - "learning_rate": 8.40727272727273e-06, - "loss": 0.0711, - "step": 439 - }, - { - "epoch": 8.0, - "grad_norm": 0.6309399008750916, - "learning_rate": 8.403636363636364e-06, - "loss": 0.0702, - "step": 440 - }, - { - "epoch": 8.0, - "eval_loss": 0.06541124731302261, - "eval_runtime": 9.0581, - "eval_samples_per_second": 601.121, - "eval_steps_per_second": 75.181, - "step": 440 - }, - { - "epoch": 8.018181818181818, - "grad_norm": 3.446725845336914, - "learning_rate": 8.400000000000001e-06, - "loss": 0.0669, - "step": 441 - }, - { - "epoch": 8.036363636363637, - "grad_norm": 3.7788219451904297, - "learning_rate": 8.396363636363638e-06, - "loss": 0.0793, - "step": 442 - }, - { - "epoch": 8.054545454545455, - "grad_norm": 2.0215859413146973, - "learning_rate": 8.392727272727274e-06, - "loss": 0.0619, - "step": 443 - }, - { - "epoch": 8.072727272727272, - "grad_norm": 1.473349928855896, - "learning_rate": 8.38909090909091e-06, - "loss": 0.071, - "step": 444 - }, - { - "epoch": 8.090909090909092, - "grad_norm": 1.8147509098052979, - "learning_rate": 8.385454545454546e-06, - "loss": 0.0621, - "step": 445 - }, - { - "epoch": 8.10909090909091, - "grad_norm": 0.18992021679878235, - "learning_rate": 8.381818181818183e-06, - "loss": 0.0599, - "step": 446 - }, - { - "epoch": 8.127272727272727, - "grad_norm": 0.23985564708709717, - "learning_rate": 8.37818181818182e-06, - "loss": 0.0592, - "step": 447 - }, - { - "epoch": 8.145454545454545, - "grad_norm": 1.1715706586837769, - "learning_rate": 8.374545454545454e-06, - "loss": 0.069, - "step": 448 - }, - { - "epoch": 8.163636363636364, - "grad_norm": 2.470081329345703, - "learning_rate": 8.370909090909091e-06, - "loss": 0.0817, - "step": 449 - }, - { - "epoch": 8.181818181818182, - "grad_norm": 0.1947249174118042, - "learning_rate": 8.367272727272728e-06, - "loss": 0.0818, - "step": 450 - }, - { - "epoch": 8.2, - "grad_norm": 0.7564553022384644, - "learning_rate": 8.363636363636365e-06, - "loss": 0.0705, - "step": 451 - }, - { - "epoch": 8.218181818181819, - "grad_norm": 0.2035110592842102, - "learning_rate": 8.36e-06, - "loss": 0.0602, - "step": 452 - }, - { - "epoch": 8.236363636363636, - "grad_norm": 3.4760782718658447, - "learning_rate": 8.356363636363636e-06, - "loss": 0.0782, - "step": 453 - }, - { - "epoch": 8.254545454545454, - "grad_norm": 0.5014008283615112, - "learning_rate": 8.352727272727273e-06, - "loss": 0.0438, - "step": 454 - }, - { - "epoch": 8.272727272727273, - "grad_norm": 0.40681782364845276, - "learning_rate": 8.34909090909091e-06, - "loss": 0.0578, - "step": 455 - }, - { - "epoch": 8.290909090909091, - "grad_norm": 1.0162789821624756, - "learning_rate": 8.345454545454546e-06, - "loss": 0.06, - "step": 456 - }, - { - "epoch": 8.309090909090909, - "grad_norm": 2.611849546432495, - "learning_rate": 8.341818181818181e-06, - "loss": 0.0511, - "step": 457 - }, - { - "epoch": 8.327272727272728, - "grad_norm": 2.1867620944976807, - "learning_rate": 8.338181818181818e-06, - "loss": 0.0484, - "step": 458 - }, - { - "epoch": 8.345454545454546, - "grad_norm": 0.3404158055782318, - "learning_rate": 8.334545454545455e-06, - "loss": 0.0663, - "step": 459 - }, - { - "epoch": 8.363636363636363, - "grad_norm": 3.2498273849487305, - "learning_rate": 8.330909090909092e-06, - "loss": 0.0663, - "step": 460 - }, - { - "epoch": 8.381818181818183, - "grad_norm": 2.0355019569396973, - "learning_rate": 8.327272727272728e-06, - "loss": 0.0651, - "step": 461 - }, - { - "epoch": 8.4, - "grad_norm": 1.7504990100860596, - "learning_rate": 8.323636363636365e-06, - "loss": 0.0633, - "step": 462 - }, - { - "epoch": 8.418181818181818, - "grad_norm": 1.168997883796692, - "learning_rate": 8.32e-06, - "loss": 0.0599, - "step": 463 - }, - { - "epoch": 8.436363636363636, - "grad_norm": 1.0976821184158325, - "learning_rate": 8.316363636363637e-06, - "loss": 0.0469, - "step": 464 - }, - { - "epoch": 8.454545454545455, - "grad_norm": 4.218688011169434, - "learning_rate": 8.312727272727273e-06, - "loss": 0.0756, - "step": 465 - }, - { - "epoch": 8.472727272727273, - "grad_norm": 5.447925090789795, - "learning_rate": 8.30909090909091e-06, - "loss": 0.073, - "step": 466 - }, - { - "epoch": 8.49090909090909, - "grad_norm": 5.7156596183776855, - "learning_rate": 8.305454545454547e-06, - "loss": 0.0871, - "step": 467 - }, - { - "epoch": 8.50909090909091, - "grad_norm": 0.28243112564086914, - "learning_rate": 8.301818181818182e-06, - "loss": 0.0512, - "step": 468 - }, - { - "epoch": 8.527272727272727, - "grad_norm": 3.071674346923828, - "learning_rate": 8.298181818181819e-06, - "loss": 0.0764, - "step": 469 - }, - { - "epoch": 8.545454545454545, - "grad_norm": 5.203369140625, - "learning_rate": 8.294545454545455e-06, - "loss": 0.0644, - "step": 470 - }, - { - "epoch": 8.563636363636364, - "grad_norm": 2.6375560760498047, - "learning_rate": 8.290909090909092e-06, - "loss": 0.0735, - "step": 471 - }, - { - "epoch": 8.581818181818182, - "grad_norm": 2.0718421936035156, - "learning_rate": 8.287272727272729e-06, - "loss": 0.0743, - "step": 472 - }, - { - "epoch": 8.6, - "grad_norm": 2.699130058288574, - "learning_rate": 8.283636363636365e-06, - "loss": 0.0866, - "step": 473 - }, - { - "epoch": 8.618181818181817, - "grad_norm": 1.4964139461517334, - "learning_rate": 8.28e-06, - "loss": 0.0597, - "step": 474 - }, - { - "epoch": 8.636363636363637, - "grad_norm": 3.148054361343384, - "learning_rate": 8.276363636363637e-06, - "loss": 0.0537, - "step": 475 - }, - { - "epoch": 8.654545454545454, - "grad_norm": 5.4403462409973145, - "learning_rate": 8.272727272727274e-06, - "loss": 0.0662, - "step": 476 - }, - { - "epoch": 8.672727272727272, - "grad_norm": 3.2796781063079834, - "learning_rate": 8.26909090909091e-06, - "loss": 0.0802, - "step": 477 - }, - { - "epoch": 8.690909090909091, - "grad_norm": 1.3469656705856323, - "learning_rate": 8.265454545454547e-06, - "loss": 0.0607, - "step": 478 - }, - { - "epoch": 8.709090909090909, - "grad_norm": 2.2391483783721924, - "learning_rate": 8.261818181818182e-06, - "loss": 0.0596, - "step": 479 - }, - { - "epoch": 8.727272727272727, - "grad_norm": 0.5589116215705872, - "learning_rate": 8.258181818181819e-06, - "loss": 0.0606, - "step": 480 - }, - { - "epoch": 8.745454545454546, - "grad_norm": 2.9762935638427734, - "learning_rate": 8.254545454545456e-06, - "loss": 0.0689, - "step": 481 - }, - { - "epoch": 8.763636363636364, - "grad_norm": 1.0212925672531128, - "learning_rate": 8.250909090909092e-06, - "loss": 0.0671, - "step": 482 - }, - { - "epoch": 8.781818181818181, - "grad_norm": 1.2807869911193848, - "learning_rate": 8.247272727272729e-06, - "loss": 0.0543, - "step": 483 - }, - { - "epoch": 8.8, - "grad_norm": 2.9307937622070312, - "learning_rate": 8.243636363636364e-06, - "loss": 0.0893, - "step": 484 - }, - { - "epoch": 8.818181818181818, - "grad_norm": 1.564207673072815, - "learning_rate": 8.24e-06, - "loss": 0.053, - "step": 485 - }, - { - "epoch": 8.836363636363636, - "grad_norm": 0.44221898913383484, - "learning_rate": 8.236363636363637e-06, - "loss": 0.0717, - "step": 486 - }, - { - "epoch": 8.854545454545455, - "grad_norm": 0.3093806803226471, - "learning_rate": 8.232727272727274e-06, - "loss": 0.0687, - "step": 487 - }, - { - "epoch": 8.872727272727273, - "grad_norm": 0.36967527866363525, - "learning_rate": 8.22909090909091e-06, - "loss": 0.0564, - "step": 488 - }, - { - "epoch": 8.89090909090909, - "grad_norm": 1.0119107961654663, - "learning_rate": 8.225454545454546e-06, - "loss": 0.0718, - "step": 489 - }, - { - "epoch": 8.909090909090908, - "grad_norm": 1.5993187427520752, - "learning_rate": 8.221818181818183e-06, - "loss": 0.0723, - "step": 490 - }, - { - "epoch": 8.927272727272728, - "grad_norm": 1.4259636402130127, - "learning_rate": 8.21818181818182e-06, - "loss": 0.0613, - "step": 491 - }, - { - "epoch": 8.945454545454545, - "grad_norm": 1.9773681163787842, - "learning_rate": 8.214545454545454e-06, - "loss": 0.0552, - "step": 492 - }, - { - "epoch": 8.963636363636363, - "grad_norm": 2.0938880443573, - "learning_rate": 8.210909090909091e-06, - "loss": 0.0571, - "step": 493 - }, - { - "epoch": 8.981818181818182, - "grad_norm": 0.488280713558197, - "learning_rate": 8.207272727272728e-06, - "loss": 0.0638, - "step": 494 - }, - { - "epoch": 9.0, - "grad_norm": 3.7130565643310547, - "learning_rate": 8.203636363636364e-06, - "loss": 0.056, - "step": 495 - }, - { - "epoch": 9.0, - "eval_loss": 0.06421126425266266, - "eval_runtime": 8.3778, - "eval_samples_per_second": 649.928, - "eval_steps_per_second": 81.286, - "step": 495 - }, - { - "epoch": 9.018181818181818, - "grad_norm": 4.112293720245361, - "learning_rate": 8.2e-06, - "loss": 0.0592, - "step": 496 - }, - { - "epoch": 9.036363636363637, - "grad_norm": 0.21660329401493073, - "learning_rate": 8.196363636363636e-06, - "loss": 0.0657, - "step": 497 - }, - { - "epoch": 9.054545454545455, - "grad_norm": 3.9101672172546387, - "learning_rate": 8.192727272727273e-06, - "loss": 0.074, - "step": 498 - }, - { - "epoch": 9.072727272727272, - "grad_norm": 2.7318665981292725, - "learning_rate": 8.18909090909091e-06, - "loss": 0.0726, - "step": 499 - }, - { - "epoch": 9.090909090909092, - "grad_norm": 3.263549327850342, - "learning_rate": 8.185454545454546e-06, - "loss": 0.0574, - "step": 500 - }, - { - "epoch": 9.10909090909091, - "grad_norm": 3.5308995246887207, - "learning_rate": 8.181818181818183e-06, - "loss": 0.0597, - "step": 501 - }, - { - "epoch": 9.127272727272727, - "grad_norm": 0.777102530002594, - "learning_rate": 8.178181818181818e-06, - "loss": 0.0799, - "step": 502 - }, - { - "epoch": 9.145454545454545, - "grad_norm": 0.89207524061203, - "learning_rate": 8.174545454545455e-06, - "loss": 0.0634, - "step": 503 - }, - { - "epoch": 9.163636363636364, - "grad_norm": 3.418482780456543, - "learning_rate": 8.170909090909091e-06, - "loss": 0.0884, - "step": 504 - }, - { - "epoch": 9.181818181818182, - "grad_norm": 3.1419575214385986, - "learning_rate": 8.167272727272728e-06, - "loss": 0.0761, - "step": 505 - }, - { - "epoch": 9.2, - "grad_norm": 0.5595026612281799, - "learning_rate": 8.163636363636365e-06, - "loss": 0.0785, - "step": 506 - }, - { - "epoch": 9.218181818181819, - "grad_norm": 2.4327001571655273, - "learning_rate": 8.16e-06, - "loss": 0.0579, - "step": 507 - }, - { - "epoch": 9.236363636363636, - "grad_norm": 0.6497751474380493, - "learning_rate": 8.156363636363637e-06, - "loss": 0.058, - "step": 508 - }, - { - "epoch": 9.254545454545454, - "grad_norm": 2.473426580429077, - "learning_rate": 8.152727272727273e-06, - "loss": 0.054, - "step": 509 - }, - { - "epoch": 9.272727272727273, - "grad_norm": 2.0657756328582764, - "learning_rate": 8.14909090909091e-06, - "loss": 0.0853, - "step": 510 - }, - { - "epoch": 9.290909090909091, - "grad_norm": 0.7585165500640869, - "learning_rate": 8.145454545454547e-06, - "loss": 0.0706, - "step": 511 - }, - { - "epoch": 9.309090909090909, - "grad_norm": 4.418927192687988, - "learning_rate": 8.141818181818183e-06, - "loss": 0.0665, - "step": 512 - }, - { - "epoch": 9.327272727272728, - "grad_norm": 7.641233444213867, - "learning_rate": 8.138181818181818e-06, - "loss": 0.0801, - "step": 513 - }, - { - "epoch": 9.345454545454546, - "grad_norm": 5.358871936798096, - "learning_rate": 8.134545454545455e-06, - "loss": 0.0665, - "step": 514 - }, - { - "epoch": 9.363636363636363, - "grad_norm": 0.37424203753471375, - "learning_rate": 8.130909090909092e-06, - "loss": 0.0614, - "step": 515 - }, - { - "epoch": 9.381818181818183, - "grad_norm": 3.7350425720214844, - "learning_rate": 8.127272727272728e-06, - "loss": 0.0693, - "step": 516 - }, - { - "epoch": 9.4, - "grad_norm": 6.881044864654541, - "learning_rate": 8.123636363636365e-06, - "loss": 0.0989, - "step": 517 - }, - { - "epoch": 9.418181818181818, - "grad_norm": 5.155004978179932, - "learning_rate": 8.120000000000002e-06, - "loss": 0.0844, - "step": 518 - }, - { - "epoch": 9.436363636363636, - "grad_norm": 0.5640428066253662, - "learning_rate": 8.116363636363637e-06, - "loss": 0.0434, - "step": 519 - }, - { - "epoch": 9.454545454545455, - "grad_norm": 4.371345520019531, - "learning_rate": 8.112727272727274e-06, - "loss": 0.0712, - "step": 520 - }, - { - "epoch": 9.472727272727273, - "grad_norm": 6.635160446166992, - "learning_rate": 8.10909090909091e-06, - "loss": 0.0717, - "step": 521 - }, - { - "epoch": 9.49090909090909, - "grad_norm": 5.827282428741455, - "learning_rate": 8.105454545454547e-06, - "loss": 0.0689, - "step": 522 - }, - { - "epoch": 9.50909090909091, - "grad_norm": 1.4285904169082642, - "learning_rate": 8.101818181818184e-06, - "loss": 0.0632, - "step": 523 - }, - { - "epoch": 9.527272727272727, - "grad_norm": 4.1109137535095215, - "learning_rate": 8.098181818181819e-06, - "loss": 0.0518, - "step": 524 - }, - { - "epoch": 9.545454545454545, - "grad_norm": 6.655120372772217, - "learning_rate": 8.094545454545455e-06, - "loss": 0.0571, - "step": 525 - }, - { - "epoch": 9.563636363636364, - "grad_norm": 7.212472915649414, - "learning_rate": 8.090909090909092e-06, - "loss": 0.0913, - "step": 526 - }, - { - "epoch": 9.581818181818182, - "grad_norm": 2.827617883682251, - "learning_rate": 8.087272727272729e-06, - "loss": 0.0564, - "step": 527 - }, - { - "epoch": 9.6, - "grad_norm": 1.735584020614624, - "learning_rate": 8.083636363636364e-06, - "loss": 0.071, - "step": 528 - }, - { - "epoch": 9.618181818181817, - "grad_norm": 4.857851028442383, - "learning_rate": 8.08e-06, - "loss": 0.0676, - "step": 529 - }, - { - "epoch": 9.636363636363637, - "grad_norm": 3.5473153591156006, - "learning_rate": 8.076363636363637e-06, - "loss": 0.063, - "step": 530 - }, - { - "epoch": 9.654545454545454, - "grad_norm": 1.8591941595077515, - "learning_rate": 8.072727272727274e-06, - "loss": 0.069, - "step": 531 - }, - { - "epoch": 9.672727272727272, - "grad_norm": 0.8483375906944275, - "learning_rate": 8.069090909090909e-06, - "loss": 0.0553, - "step": 532 - }, - { - "epoch": 9.690909090909091, - "grad_norm": 2.232304811477661, - "learning_rate": 8.065454545454546e-06, - "loss": 0.0653, - "step": 533 - }, - { - "epoch": 9.709090909090909, - "grad_norm": 0.2513748109340668, - "learning_rate": 8.061818181818182e-06, - "loss": 0.073, - "step": 534 - }, - { - "epoch": 9.727272727272727, - "grad_norm": 0.7919653654098511, - "learning_rate": 8.058181818181819e-06, - "loss": 0.0636, - "step": 535 - }, - { - "epoch": 9.745454545454546, - "grad_norm": 0.3363504111766815, - "learning_rate": 8.054545454545454e-06, - "loss": 0.0542, - "step": 536 - }, - { - "epoch": 9.763636363636364, - "grad_norm": 0.8563377261161804, - "learning_rate": 8.05090909090909e-06, - "loss": 0.0674, - "step": 537 - }, - { - "epoch": 9.781818181818181, - "grad_norm": 2.4276349544525146, - "learning_rate": 8.047272727272728e-06, - "loss": 0.0537, - "step": 538 - }, - { - "epoch": 9.8, - "grad_norm": 1.237955927848816, - "learning_rate": 8.043636363636364e-06, - "loss": 0.0633, - "step": 539 - }, - { - "epoch": 9.818181818181818, - "grad_norm": 2.669100761413574, - "learning_rate": 8.040000000000001e-06, - "loss": 0.0662, - "step": 540 - }, - { - "epoch": 9.836363636363636, - "grad_norm": 3.0456113815307617, - "learning_rate": 8.036363636363636e-06, - "loss": 0.073, - "step": 541 - }, - { - "epoch": 9.854545454545455, - "grad_norm": 0.6336878538131714, - "learning_rate": 8.032727272727273e-06, - "loss": 0.0914, - "step": 542 - }, - { - "epoch": 9.872727272727273, - "grad_norm": 4.742973327636719, - "learning_rate": 8.02909090909091e-06, - "loss": 0.0636, - "step": 543 - }, - { - "epoch": 9.89090909090909, - "grad_norm": 3.9023191928863525, - "learning_rate": 8.025454545454546e-06, - "loss": 0.0742, - "step": 544 - }, - { - "epoch": 9.909090909090908, - "grad_norm": 2.8903048038482666, - "learning_rate": 8.021818181818183e-06, - "loss": 0.0536, - "step": 545 - }, - { - "epoch": 9.927272727272728, - "grad_norm": 1.7286165952682495, - "learning_rate": 8.018181818181818e-06, - "loss": 0.0553, - "step": 546 - }, - { - "epoch": 9.945454545454545, - "grad_norm": 4.09605073928833, - "learning_rate": 8.014545454545455e-06, - "loss": 0.0681, - "step": 547 - }, - { - "epoch": 9.963636363636363, - "grad_norm": 2.231682300567627, - "learning_rate": 8.010909090909091e-06, - "loss": 0.0559, - "step": 548 - }, - { - "epoch": 9.981818181818182, - "grad_norm": 2.8798108100891113, - "learning_rate": 8.007272727272728e-06, - "loss": 0.0449, - "step": 549 - }, - { - "epoch": 10.0, - "grad_norm": 4.035531044006348, - "learning_rate": 8.003636363636365e-06, - "loss": 0.0604, - "step": 550 - }, - { - "epoch": 10.0, - "eval_loss": 0.06064901500940323, - "eval_runtime": 8.6122, - "eval_samples_per_second": 632.246, - "eval_steps_per_second": 79.074, - "step": 550 - }, - { - "epoch": 10.018181818181818, - "grad_norm": 1.5404729843139648, - "learning_rate": 8.000000000000001e-06, - "loss": 0.0482, - "step": 551 - }, - { - "epoch": 10.036363636363637, - "grad_norm": 3.114063262939453, - "learning_rate": 7.996363636363636e-06, - "loss": 0.0473, - "step": 552 - }, - { - "epoch": 10.054545454545455, - "grad_norm": 5.846924781799316, - "learning_rate": 7.992727272727273e-06, - "loss": 0.0746, - "step": 553 - }, - { - "epoch": 10.072727272727272, - "grad_norm": 3.3693761825561523, - "learning_rate": 7.98909090909091e-06, - "loss": 0.0701, - "step": 554 - }, - { - "epoch": 10.090909090909092, - "grad_norm": 1.436720371246338, - "learning_rate": 7.985454545454546e-06, - "loss": 0.0776, - "step": 555 - }, - { - "epoch": 10.10909090909091, - "grad_norm": 5.572106838226318, - "learning_rate": 7.981818181818183e-06, - "loss": 0.0751, - "step": 556 - }, - { - "epoch": 10.127272727272727, - "grad_norm": 9.685996055603027, - "learning_rate": 7.97818181818182e-06, - "loss": 0.093, - "step": 557 - }, - { - "epoch": 10.145454545454545, - "grad_norm": 8.939018249511719, - "learning_rate": 7.974545454545455e-06, - "loss": 0.0947, - "step": 558 - }, - { - "epoch": 10.163636363636364, - "grad_norm": 4.770912170410156, - "learning_rate": 7.970909090909092e-06, - "loss": 0.082, - "step": 559 - }, - { - "epoch": 10.181818181818182, - "grad_norm": 1.110264539718628, - "learning_rate": 7.967272727272728e-06, - "loss": 0.0552, - "step": 560 - }, - { - "epoch": 10.2, - "grad_norm": 6.9684858322143555, - "learning_rate": 7.963636363636365e-06, - "loss": 0.0736, - "step": 561 - }, - { - "epoch": 10.218181818181819, - "grad_norm": 11.681800842285156, - "learning_rate": 7.960000000000002e-06, - "loss": 0.1251, - "step": 562 - }, - { - "epoch": 10.236363636363636, - "grad_norm": 11.092482566833496, - "learning_rate": 7.956363636363637e-06, - "loss": 0.0995, - "step": 563 - }, - { - "epoch": 10.254545454545454, - "grad_norm": 9.022331237792969, - "learning_rate": 7.952727272727273e-06, - "loss": 0.0816, - "step": 564 - }, - { - "epoch": 10.272727272727273, - "grad_norm": 4.717870712280273, - "learning_rate": 7.94909090909091e-06, - "loss": 0.072, - "step": 565 - }, - { - "epoch": 10.290909090909091, - "grad_norm": 1.3735060691833496, - "learning_rate": 7.945454545454547e-06, - "loss": 0.0653, - "step": 566 - }, - { - "epoch": 10.309090909090909, - "grad_norm": 4.640903472900391, - "learning_rate": 7.941818181818184e-06, - "loss": 0.0683, - "step": 567 - }, - { - "epoch": 10.327272727272728, - "grad_norm": 3.9371042251586914, - "learning_rate": 7.938181818181819e-06, - "loss": 0.0562, - "step": 568 - }, - { - "epoch": 10.345454545454546, - "grad_norm": 1.6011143922805786, - "learning_rate": 7.934545454545455e-06, - "loss": 0.0731, - "step": 569 - }, - { - "epoch": 10.363636363636363, - "grad_norm": 2.02890944480896, - "learning_rate": 7.930909090909092e-06, - "loss": 0.0884, - "step": 570 - }, - { - "epoch": 10.381818181818183, - "grad_norm": 2.6308634281158447, - "learning_rate": 7.927272727272729e-06, - "loss": 0.0693, - "step": 571 - }, - { - "epoch": 10.4, - "grad_norm": 3.121800422668457, - "learning_rate": 7.923636363636364e-06, - "loss": 0.0619, - "step": 572 - }, - { - "epoch": 10.418181818181818, - "grad_norm": 0.6850519776344299, - "learning_rate": 7.92e-06, - "loss": 0.0708, - "step": 573 - }, - { - "epoch": 10.436363636363636, - "grad_norm": 2.9358580112457275, - "learning_rate": 7.916363636363637e-06, - "loss": 0.0654, - "step": 574 - }, - { - "epoch": 10.454545454545455, - "grad_norm": 0.8702022433280945, - "learning_rate": 7.912727272727274e-06, - "loss": 0.0745, - "step": 575 - }, - { - "epoch": 10.472727272727273, - "grad_norm": 1.8396024703979492, - "learning_rate": 7.909090909090909e-06, - "loss": 0.0633, - "step": 576 - }, - { - "epoch": 10.49090909090909, - "grad_norm": 0.48891106247901917, - "learning_rate": 7.905454545454546e-06, - "loss": 0.0809, - "step": 577 - }, - { - "epoch": 10.50909090909091, - "grad_norm": 0.6508380174636841, - "learning_rate": 7.901818181818182e-06, - "loss": 0.0619, - "step": 578 - }, - { - "epoch": 10.527272727272727, - "grad_norm": 1.9046374559402466, - "learning_rate": 7.898181818181819e-06, - "loss": 0.0588, - "step": 579 - }, - { - "epoch": 10.545454545454545, - "grad_norm": 1.3191097974777222, - "learning_rate": 7.894545454545454e-06, - "loss": 0.0641, - "step": 580 - }, - { - "epoch": 10.563636363636364, - "grad_norm": 1.9656713008880615, - "learning_rate": 7.89090909090909e-06, - "loss": 0.073, - "step": 581 - }, - { - "epoch": 10.581818181818182, - "grad_norm": 3.7868380546569824, - "learning_rate": 7.887272727272727e-06, - "loss": 0.0791, - "step": 582 - }, - { - "epoch": 10.6, - "grad_norm": 1.6309148073196411, - "learning_rate": 7.883636363636364e-06, - "loss": 0.0599, - "step": 583 - }, - { - "epoch": 10.618181818181817, - "grad_norm": 1.633113980293274, - "learning_rate": 7.88e-06, - "loss": 0.0569, - "step": 584 - }, - { - "epoch": 10.636363636363637, - "grad_norm": 3.783313751220703, - "learning_rate": 7.876363636363637e-06, - "loss": 0.0656, - "step": 585 - }, - { - "epoch": 10.654545454545454, - "grad_norm": 3.3318347930908203, - "learning_rate": 7.872727272727273e-06, - "loss": 0.0586, - "step": 586 - }, - { - "epoch": 10.672727272727272, - "grad_norm": 1.6379303932189941, - "learning_rate": 7.86909090909091e-06, - "loss": 0.057, - "step": 587 - }, - { - "epoch": 10.690909090909091, - "grad_norm": 2.8293278217315674, - "learning_rate": 7.865454545454546e-06, - "loss": 0.0605, - "step": 588 - }, - { - "epoch": 10.709090909090909, - "grad_norm": 1.4704664945602417, - "learning_rate": 7.861818181818183e-06, - "loss": 0.064, - "step": 589 - }, - { - "epoch": 10.727272727272727, - "grad_norm": 0.4450334906578064, - "learning_rate": 7.85818181818182e-06, - "loss": 0.0609, - "step": 590 - }, - { - "epoch": 10.745454545454546, - "grad_norm": 2.7354488372802734, - "learning_rate": 7.854545454545454e-06, - "loss": 0.0555, - "step": 591 - }, - { - "epoch": 10.763636363636364, - "grad_norm": 1.5932197570800781, - "learning_rate": 7.850909090909091e-06, - "loss": 0.0651, - "step": 592 - }, - { - "epoch": 10.781818181818181, - "grad_norm": 2.4741339683532715, - "learning_rate": 7.847272727272728e-06, - "loss": 0.0363, - "step": 593 - }, - { - "epoch": 10.8, - "grad_norm": 3.246129274368286, - "learning_rate": 7.843636363636364e-06, - "loss": 0.0581, - "step": 594 - }, - { - "epoch": 10.818181818181818, - "grad_norm": 0.36132627725601196, - "learning_rate": 7.840000000000001e-06, - "loss": 0.0857, - "step": 595 - }, - { - "epoch": 10.836363636363636, - "grad_norm": 1.4849697351455688, - "learning_rate": 7.836363636363638e-06, - "loss": 0.0532, - "step": 596 - }, - { - "epoch": 10.854545454545455, - "grad_norm": 2.342519521713257, - "learning_rate": 7.832727272727273e-06, - "loss": 0.0523, - "step": 597 - }, - { - "epoch": 10.872727272727273, - "grad_norm": 0.4584204852581024, - "learning_rate": 7.82909090909091e-06, - "loss": 0.0515, - "step": 598 - }, - { - "epoch": 10.89090909090909, - "grad_norm": 1.1963205337524414, - "learning_rate": 7.825454545454546e-06, - "loss": 0.0596, - "step": 599 - }, - { - "epoch": 10.909090909090908, - "grad_norm": 1.4187803268432617, - "learning_rate": 7.821818181818183e-06, - "loss": 0.0615, - "step": 600 - }, - { - "epoch": 10.927272727272728, - "grad_norm": 2.181539535522461, - "learning_rate": 7.81818181818182e-06, - "loss": 0.0811, - "step": 601 - }, - { - "epoch": 10.945454545454545, - "grad_norm": 3.3171987533569336, - "learning_rate": 7.814545454545455e-06, - "loss": 0.0697, - "step": 602 - }, - { - "epoch": 10.963636363636363, - "grad_norm": 2.7163238525390625, - "learning_rate": 7.810909090909091e-06, - "loss": 0.0472, - "step": 603 - }, - { - "epoch": 10.981818181818182, - "grad_norm": 3.5020272731781006, - "learning_rate": 7.807272727272728e-06, - "loss": 0.0611, - "step": 604 - }, - { - "epoch": 11.0, - "grad_norm": 5.216598987579346, - "learning_rate": 7.803636363636365e-06, - "loss": 0.0788, - "step": 605 - }, - { - "epoch": 11.0, - "eval_loss": 0.0667845606803894, - "eval_runtime": 9.3233, - "eval_samples_per_second": 584.023, - "eval_steps_per_second": 73.043, - "step": 605 - }, - { - "epoch": 11.018181818181818, - "grad_norm": 3.8007099628448486, - "learning_rate": 7.800000000000002e-06, - "loss": 0.0815, - "step": 606 - }, - { - "epoch": 11.036363636363637, - "grad_norm": 0.8711565136909485, - "learning_rate": 7.796363636363638e-06, - "loss": 0.0536, - "step": 607 - }, - { - "epoch": 11.054545454545455, - "grad_norm": 1.9247506856918335, - "learning_rate": 7.792727272727273e-06, - "loss": 0.057, - "step": 608 - }, - { - "epoch": 11.072727272727272, - "grad_norm": 0.9094435572624207, - "learning_rate": 7.78909090909091e-06, - "loss": 0.0803, - "step": 609 - }, - { - "epoch": 11.090909090909092, - "grad_norm": 1.1869723796844482, - "learning_rate": 7.785454545454547e-06, - "loss": 0.0686, - "step": 610 - }, - { - "epoch": 11.10909090909091, - "grad_norm": 1.04933762550354, - "learning_rate": 7.781818181818183e-06, - "loss": 0.0648, - "step": 611 - }, - { - "epoch": 11.127272727272727, - "grad_norm": 0.19710782170295715, - "learning_rate": 7.778181818181818e-06, - "loss": 0.061, - "step": 612 - }, - { - "epoch": 11.145454545454545, - "grad_norm": 3.3665642738342285, - "learning_rate": 7.774545454545455e-06, - "loss": 0.059, - "step": 613 - }, - { - "epoch": 11.163636363636364, - "grad_norm": 0.30850616097450256, - "learning_rate": 7.770909090909092e-06, - "loss": 0.0648, - "step": 614 - }, - { - "epoch": 11.181818181818182, - "grad_norm": 1.201808214187622, - "learning_rate": 7.767272727272729e-06, - "loss": 0.0836, - "step": 615 - }, - { - "epoch": 11.2, - "grad_norm": 0.4459249973297119, - "learning_rate": 7.763636363636364e-06, - "loss": 0.0626, - "step": 616 - }, - { - "epoch": 11.218181818181819, - "grad_norm": 1.6041003465652466, - "learning_rate": 7.76e-06, - "loss": 0.0668, - "step": 617 - }, - { - "epoch": 11.236363636363636, - "grad_norm": 0.3569484353065491, - "learning_rate": 7.756363636363637e-06, - "loss": 0.0526, - "step": 618 - }, - { - "epoch": 11.254545454545454, - "grad_norm": 1.8243213891983032, - "learning_rate": 7.752727272727274e-06, - "loss": 0.0556, - "step": 619 - }, - { - "epoch": 11.272727272727273, - "grad_norm": 0.8160420060157776, - "learning_rate": 7.749090909090909e-06, - "loss": 0.0637, - "step": 620 - }, - { - "epoch": 11.290909090909091, - "grad_norm": 3.0070366859436035, - "learning_rate": 7.745454545454545e-06, - "loss": 0.0515, - "step": 621 - }, - { - "epoch": 11.309090909090909, - "grad_norm": 2.396848678588867, - "learning_rate": 7.741818181818182e-06, - "loss": 0.0529, - "step": 622 - }, - { - "epoch": 11.327272727272728, - "grad_norm": 0.31223157048225403, - "learning_rate": 7.738181818181819e-06, - "loss": 0.0483, - "step": 623 - }, - { - "epoch": 11.345454545454546, - "grad_norm": 1.1597321033477783, - "learning_rate": 7.734545454545455e-06, - "loss": 0.0682, - "step": 624 - }, - { - "epoch": 11.363636363636363, - "grad_norm": 1.0534957647323608, - "learning_rate": 7.73090909090909e-06, - "loss": 0.0586, - "step": 625 - }, - { - "epoch": 11.381818181818183, - "grad_norm": 1.6788127422332764, - "learning_rate": 7.727272727272727e-06, - "loss": 0.0629, - "step": 626 - }, - { - "epoch": 11.4, - "grad_norm": 1.028563141822815, - "learning_rate": 7.723636363636364e-06, - "loss": 0.0651, - "step": 627 - }, - { - "epoch": 11.418181818181818, - "grad_norm": 1.6237061023712158, - "learning_rate": 7.72e-06, - "loss": 0.0522, - "step": 628 - }, - { - "epoch": 11.436363636363636, - "grad_norm": 1.3759944438934326, - "learning_rate": 7.716363636363637e-06, - "loss": 0.0675, - "step": 629 - }, - { - "epoch": 11.454545454545455, - "grad_norm": 1.3063701391220093, - "learning_rate": 7.712727272727272e-06, - "loss": 0.0558, - "step": 630 - }, - { - "epoch": 11.472727272727273, - "grad_norm": 0.7629415392875671, - "learning_rate": 7.709090909090909e-06, - "loss": 0.0836, - "step": 631 - }, - { - "epoch": 11.49090909090909, - "grad_norm": 0.5609139800071716, - "learning_rate": 7.705454545454546e-06, - "loss": 0.0763, - "step": 632 - }, - { - "epoch": 11.50909090909091, - "grad_norm": 0.39255690574645996, - "learning_rate": 7.701818181818182e-06, - "loss": 0.0532, - "step": 633 - }, - { - "epoch": 11.527272727272727, - "grad_norm": 0.5675791501998901, - "learning_rate": 7.69818181818182e-06, - "loss": 0.0734, - "step": 634 - }, - { - "epoch": 11.545454545454545, - "grad_norm": 0.4140382707118988, - "learning_rate": 7.694545454545456e-06, - "loss": 0.0386, - "step": 635 - }, - { - "epoch": 11.563636363636364, - "grad_norm": 1.062186360359192, - "learning_rate": 7.690909090909091e-06, - "loss": 0.0594, - "step": 636 - }, - { - "epoch": 11.581818181818182, - "grad_norm": 1.7927665710449219, - "learning_rate": 7.687272727272728e-06, - "loss": 0.0697, - "step": 637 - }, - { - "epoch": 11.6, - "grad_norm": 2.9303150177001953, - "learning_rate": 7.683636363636364e-06, - "loss": 0.0708, - "step": 638 - }, - { - "epoch": 11.618181818181817, - "grad_norm": 5.85276460647583, - "learning_rate": 7.680000000000001e-06, - "loss": 0.0897, - "step": 639 - }, - { - "epoch": 11.636363636363637, - "grad_norm": 4.171004772186279, - "learning_rate": 7.676363636363638e-06, - "loss": 0.0611, - "step": 640 - }, - { - "epoch": 11.654545454545454, - "grad_norm": 1.3046609163284302, - "learning_rate": 7.672727272727273e-06, - "loss": 0.0513, - "step": 641 - }, - { - "epoch": 11.672727272727272, - "grad_norm": 5.081771373748779, - "learning_rate": 7.66909090909091e-06, - "loss": 0.0648, - "step": 642 - }, - { - "epoch": 11.690909090909091, - "grad_norm": 7.371913909912109, - "learning_rate": 7.665454545454546e-06, - "loss": 0.0726, - "step": 643 - }, - { - "epoch": 11.709090909090909, - "grad_norm": 7.019650936126709, - "learning_rate": 7.661818181818183e-06, - "loss": 0.0924, - "step": 644 - }, - { - "epoch": 11.727272727272727, - "grad_norm": 5.571925163269043, - "learning_rate": 7.65818181818182e-06, - "loss": 0.0757, - "step": 645 - }, - { - "epoch": 11.745454545454546, - "grad_norm": 1.481839656829834, - "learning_rate": 7.654545454545456e-06, - "loss": 0.0683, - "step": 646 - }, - { - "epoch": 11.763636363636364, - "grad_norm": 6.706080913543701, - "learning_rate": 7.650909090909091e-06, - "loss": 0.0705, - "step": 647 - }, - { - "epoch": 11.781818181818181, - "grad_norm": 10.265091896057129, - "learning_rate": 7.647272727272728e-06, - "loss": 0.0923, - "step": 648 - }, - { - "epoch": 11.8, - "grad_norm": 10.33637809753418, - "learning_rate": 7.643636363636365e-06, - "loss": 0.0819, - "step": 649 - }, - { - "epoch": 11.818181818181818, - "grad_norm": 8.443577766418457, - "learning_rate": 7.640000000000001e-06, - "loss": 0.0779, - "step": 650 - }, - { - "epoch": 11.836363636363636, - "grad_norm": 4.675793647766113, - "learning_rate": 7.636363636363638e-06, - "loss": 0.0692, - "step": 651 - }, - { - "epoch": 11.854545454545455, - "grad_norm": 1.7131801843643188, - "learning_rate": 7.632727272727273e-06, - "loss": 0.0634, - "step": 652 - }, - { - "epoch": 11.872727272727273, - "grad_norm": 4.444218635559082, - "learning_rate": 7.629090909090909e-06, - "loss": 0.067, - "step": 653 - }, - { - "epoch": 11.89090909090909, - "grad_norm": 5.929459095001221, - "learning_rate": 7.625454545454546e-06, - "loss": 0.0782, - "step": 654 - }, - { - "epoch": 11.909090909090908, - "grad_norm": 3.1463005542755127, - "learning_rate": 7.621818181818182e-06, - "loss": 0.0599, - "step": 655 - }, - { - "epoch": 11.927272727272728, - "grad_norm": 1.6024874448776245, - "learning_rate": 7.618181818181819e-06, - "loss": 0.0606, - "step": 656 - }, - { - "epoch": 11.945454545454545, - "grad_norm": 2.381667375564575, - "learning_rate": 7.614545454545456e-06, - "loss": 0.0766, - "step": 657 - }, - { - "epoch": 11.963636363636363, - "grad_norm": 2.4177682399749756, - "learning_rate": 7.610909090909091e-06, - "loss": 0.079, - "step": 658 - }, - { - "epoch": 11.981818181818182, - "grad_norm": 1.4133063554763794, - "learning_rate": 7.6072727272727275e-06, - "loss": 0.0608, - "step": 659 - }, - { - "epoch": 12.0, - "grad_norm": 1.634961724281311, - "learning_rate": 7.603636363636364e-06, - "loss": 0.0517, - "step": 660 - }, - { - "epoch": 12.0, - "eval_loss": 0.05948838219046593, - "eval_runtime": 9.4861, - "eval_samples_per_second": 573.995, - "eval_steps_per_second": 71.789, - "step": 660 - }, - { - "epoch": 12.018181818181818, - "grad_norm": 1.452169418334961, - "learning_rate": 7.600000000000001e-06, - "loss": 0.06, - "step": 661 - }, - { - "epoch": 12.036363636363637, - "grad_norm": 2.366448163986206, - "learning_rate": 7.596363636363638e-06, - "loss": 0.0792, - "step": 662 - }, - { - "epoch": 12.054545454545455, - "grad_norm": 5.218116760253906, - "learning_rate": 7.5927272727272735e-06, - "loss": 0.0727, - "step": 663 - }, - { - "epoch": 12.072727272727272, - "grad_norm": 4.4769744873046875, - "learning_rate": 7.589090909090909e-06, - "loss": 0.0794, - "step": 664 - }, - { - "epoch": 12.090909090909092, - "grad_norm": 1.2579691410064697, - "learning_rate": 7.585454545454546e-06, - "loss": 0.0564, - "step": 665 - }, - { - "epoch": 12.10909090909091, - "grad_norm": 5.880563735961914, - "learning_rate": 7.581818181818183e-06, - "loss": 0.084, - "step": 666 - }, - { - "epoch": 12.127272727272727, - "grad_norm": 6.151512622833252, - "learning_rate": 7.578181818181819e-06, - "loss": 0.0727, - "step": 667 - }, - { - "epoch": 12.145454545454545, - "grad_norm": 8.321369171142578, - "learning_rate": 7.574545454545455e-06, - "loss": 0.1126, - "step": 668 - }, - { - "epoch": 12.163636363636364, - "grad_norm": 4.0931572914123535, - "learning_rate": 7.570909090909091e-06, - "loss": 0.0633, - "step": 669 - }, - { - "epoch": 12.181818181818182, - "grad_norm": 0.2873873710632324, - "learning_rate": 7.567272727272728e-06, - "loss": 0.0564, - "step": 670 - }, - { - "epoch": 12.2, - "grad_norm": 2.830284595489502, - "learning_rate": 7.563636363636364e-06, - "loss": 0.0508, - "step": 671 - }, - { - "epoch": 12.218181818181819, - "grad_norm": 5.350972652435303, - "learning_rate": 7.5600000000000005e-06, - "loss": 0.0671, - "step": 672 - }, - { - "epoch": 12.236363636363636, - "grad_norm": 4.041622638702393, - "learning_rate": 7.556363636363637e-06, - "loss": 0.0674, - "step": 673 - }, - { - "epoch": 12.254545454545454, - "grad_norm": 1.149750828742981, - "learning_rate": 7.552727272727274e-06, - "loss": 0.0601, - "step": 674 - }, - { - "epoch": 12.272727272727273, - "grad_norm": 2.830841064453125, - "learning_rate": 7.549090909090909e-06, - "loss": 0.0787, - "step": 675 - }, - { - "epoch": 12.290909090909091, - "grad_norm": 0.3369740843772888, - "learning_rate": 7.545454545454546e-06, - "loss": 0.0523, - "step": 676 - }, - { - "epoch": 12.309090909090909, - "grad_norm": 0.6564288139343262, - "learning_rate": 7.541818181818182e-06, - "loss": 0.0777, - "step": 677 - }, - { - "epoch": 12.327272727272728, - "grad_norm": 0.44555386900901794, - "learning_rate": 7.538181818181819e-06, - "loss": 0.0646, - "step": 678 - }, - { - "epoch": 12.345454545454546, - "grad_norm": 1.3871268033981323, - "learning_rate": 7.534545454545456e-06, - "loss": 0.0692, - "step": 679 - }, - { - "epoch": 12.363636363636363, - "grad_norm": 0.6446318626403809, - "learning_rate": 7.530909090909092e-06, - "loss": 0.0585, - "step": 680 - }, - { - "epoch": 12.381818181818183, - "grad_norm": 0.3868294954299927, - "learning_rate": 7.5272727272727274e-06, - "loss": 0.0406, - "step": 681 - }, - { - "epoch": 12.4, - "grad_norm": 0.6543391942977905, - "learning_rate": 7.523636363636364e-06, - "loss": 0.0747, - "step": 682 - }, - { - "epoch": 12.418181818181818, - "grad_norm": 2.561845541000366, - "learning_rate": 7.520000000000001e-06, - "loss": 0.0701, - "step": 683 - }, - { - "epoch": 12.436363636363636, - "grad_norm": 1.3928570747375488, - "learning_rate": 7.5163636363636376e-06, - "loss": 0.0706, - "step": 684 - }, - { - "epoch": 12.454545454545455, - "grad_norm": 0.7241390943527222, - "learning_rate": 7.512727272727273e-06, - "loss": 0.07, - "step": 685 - }, - { - "epoch": 12.472727272727273, - "grad_norm": 2.677582025527954, - "learning_rate": 7.509090909090909e-06, - "loss": 0.0587, - "step": 686 - }, - { - "epoch": 12.49090909090909, - "grad_norm": 1.1967884302139282, - "learning_rate": 7.505454545454546e-06, - "loss": 0.0547, - "step": 687 - }, - { - "epoch": 12.50909090909091, - "grad_norm": 4.837310791015625, - "learning_rate": 7.501818181818183e-06, - "loss": 0.0675, - "step": 688 - }, - { - "epoch": 12.527272727272727, - "grad_norm": 3.8276240825653076, - "learning_rate": 7.4981818181818185e-06, - "loss": 0.0622, - "step": 689 - }, - { - "epoch": 12.545454545454545, - "grad_norm": 1.927840232849121, - "learning_rate": 7.494545454545455e-06, - "loss": 0.0547, - "step": 690 - }, - { - "epoch": 12.563636363636364, - "grad_norm": 1.5133333206176758, - "learning_rate": 7.490909090909092e-06, - "loss": 0.0655, - "step": 691 - }, - { - "epoch": 12.581818181818182, - "grad_norm": 2.5338287353515625, - "learning_rate": 7.487272727272728e-06, - "loss": 0.0548, - "step": 692 - }, - { - "epoch": 12.6, - "grad_norm": 0.7447697520256042, - "learning_rate": 7.483636363636364e-06, - "loss": 0.0709, - "step": 693 - }, - { - "epoch": 12.618181818181817, - "grad_norm": 0.9906661510467529, - "learning_rate": 7.48e-06, - "loss": 0.0693, - "step": 694 - }, - { - "epoch": 12.636363636363637, - "grad_norm": 0.770124614238739, - "learning_rate": 7.476363636363637e-06, - "loss": 0.0598, - "step": 695 - }, - { - "epoch": 12.654545454545454, - "grad_norm": 1.2254608869552612, - "learning_rate": 7.472727272727274e-06, - "loss": 0.0526, - "step": 696 - }, - { - "epoch": 12.672727272727272, - "grad_norm": 0.2181709259748459, - "learning_rate": 7.469090909090909e-06, - "loss": 0.0721, - "step": 697 - }, - { - "epoch": 12.690909090909091, - "grad_norm": 0.9091073274612427, - "learning_rate": 7.4654545454545455e-06, - "loss": 0.0543, - "step": 698 - }, - { - "epoch": 12.709090909090909, - "grad_norm": 1.4324709177017212, - "learning_rate": 7.461818181818182e-06, - "loss": 0.0544, - "step": 699 - }, - { - "epoch": 12.727272727272727, - "grad_norm": 2.195816993713379, - "learning_rate": 7.458181818181819e-06, - "loss": 0.063, - "step": 700 - }, - { - "epoch": 12.745454545454546, - "grad_norm": 1.558327078819275, - "learning_rate": 7.454545454545456e-06, - "loss": 0.0564, - "step": 701 - }, - { - "epoch": 12.763636363636364, - "grad_norm": 0.8436335921287537, - "learning_rate": 7.450909090909092e-06, - "loss": 0.0588, - "step": 702 - }, - { - "epoch": 12.781818181818181, - "grad_norm": 0.7248838543891907, - "learning_rate": 7.447272727272727e-06, - "loss": 0.069, - "step": 703 - }, - { - "epoch": 12.8, - "grad_norm": 2.033984899520874, - "learning_rate": 7.443636363636364e-06, - "loss": 0.0642, - "step": 704 - }, - { - "epoch": 12.818181818181818, - "grad_norm": 0.9111160635948181, - "learning_rate": 7.440000000000001e-06, - "loss": 0.0547, - "step": 705 - }, - { - "epoch": 12.836363636363636, - "grad_norm": 0.45176464319229126, - "learning_rate": 7.4363636363636375e-06, - "loss": 0.0521, - "step": 706 - }, - { - "epoch": 12.854545454545455, - "grad_norm": 1.3567719459533691, - "learning_rate": 7.432727272727273e-06, - "loss": 0.0646, - "step": 707 - }, - { - "epoch": 12.872727272727273, - "grad_norm": 0.5457467436790466, - "learning_rate": 7.42909090909091e-06, - "loss": 0.0545, - "step": 708 - }, - { - "epoch": 12.89090909090909, - "grad_norm": 0.24262972176074982, - "learning_rate": 7.425454545454546e-06, - "loss": 0.0668, - "step": 709 - }, - { - "epoch": 12.909090909090908, - "grad_norm": 1.255255103111267, - "learning_rate": 7.421818181818183e-06, - "loss": 0.0621, - "step": 710 - }, - { - "epoch": 12.927272727272728, - "grad_norm": 1.5009714365005493, - "learning_rate": 7.4181818181818185e-06, - "loss": 0.0611, - "step": 711 - }, - { - "epoch": 12.945454545454545, - "grad_norm": 0.5298284888267517, - "learning_rate": 7.414545454545455e-06, - "loss": 0.0749, - "step": 712 - }, - { - "epoch": 12.963636363636363, - "grad_norm": 1.8827316761016846, - "learning_rate": 7.410909090909092e-06, - "loss": 0.0434, - "step": 713 - }, - { - "epoch": 12.981818181818182, - "grad_norm": 2.692812204360962, - "learning_rate": 7.407272727272728e-06, - "loss": 0.0662, - "step": 714 - }, - { - "epoch": 13.0, - "grad_norm": 3.337789535522461, - "learning_rate": 7.403636363636364e-06, - "loss": 0.0665, - "step": 715 - }, - { - "epoch": 13.0, - "eval_loss": 0.06334304809570312, - "eval_runtime": 8.809, - "eval_samples_per_second": 618.116, - "eval_steps_per_second": 77.307, - "step": 715 - }, - { - "epoch": 13.018181818181818, - "grad_norm": 0.9326932430267334, - "learning_rate": 7.4e-06, - "loss": 0.0669, - "step": 716 - }, - { - "epoch": 13.036363636363637, - "grad_norm": 1.3411608934402466, - "learning_rate": 7.396363636363637e-06, - "loss": 0.0711, - "step": 717 - }, - { - "epoch": 13.054545454545455, - "grad_norm": 0.4815145432949066, - "learning_rate": 7.392727272727274e-06, - "loss": 0.082, - "step": 718 - }, - { - "epoch": 13.072727272727272, - "grad_norm": 0.2861839532852173, - "learning_rate": 7.38909090909091e-06, - "loss": 0.0724, - "step": 719 - }, - { - "epoch": 13.090909090909092, - "grad_norm": 0.5339202880859375, - "learning_rate": 7.3854545454545454e-06, - "loss": 0.0655, - "step": 720 - }, - { - "epoch": 13.10909090909091, - "grad_norm": 1.1817333698272705, - "learning_rate": 7.381818181818182e-06, - "loss": 0.0685, - "step": 721 - }, - { - "epoch": 13.127272727272727, - "grad_norm": 1.7013787031173706, - "learning_rate": 7.378181818181819e-06, - "loss": 0.0646, - "step": 722 - }, - { - "epoch": 13.145454545454545, - "grad_norm": 0.583050549030304, - "learning_rate": 7.3745454545454556e-06, - "loss": 0.0772, - "step": 723 - }, - { - "epoch": 13.163636363636364, - "grad_norm": 1.9750226736068726, - "learning_rate": 7.370909090909092e-06, - "loss": 0.0695, - "step": 724 - }, - { - "epoch": 13.181818181818182, - "grad_norm": 2.6900227069854736, - "learning_rate": 7.367272727272727e-06, - "loss": 0.0573, - "step": 725 - }, - { - "epoch": 13.2, - "grad_norm": 2.119184732437134, - "learning_rate": 7.363636363636364e-06, - "loss": 0.0593, - "step": 726 - }, - { - "epoch": 13.218181818181819, - "grad_norm": 0.5078383684158325, - "learning_rate": 7.360000000000001e-06, - "loss": 0.0535, - "step": 727 - }, - { - "epoch": 13.236363636363636, - "grad_norm": 0.7392340302467346, - "learning_rate": 7.356363636363637e-06, - "loss": 0.0519, - "step": 728 - }, - { - "epoch": 13.254545454545454, - "grad_norm": 1.93234384059906, - "learning_rate": 7.352727272727273e-06, - "loss": 0.0665, - "step": 729 - }, - { - "epoch": 13.272727272727273, - "grad_norm": 1.7226887941360474, - "learning_rate": 7.34909090909091e-06, - "loss": 0.0576, - "step": 730 - }, - { - "epoch": 13.290909090909091, - "grad_norm": 1.1805757284164429, - "learning_rate": 7.345454545454546e-06, - "loss": 0.0762, - "step": 731 - }, - { - "epoch": 13.309090909090909, - "grad_norm": 5.6733880043029785, - "learning_rate": 7.3418181818181825e-06, - "loss": 0.0581, - "step": 732 - }, - { - "epoch": 13.327272727272728, - "grad_norm": 3.243814706802368, - "learning_rate": 7.338181818181818e-06, - "loss": 0.0547, - "step": 733 - }, - { - "epoch": 13.345454545454546, - "grad_norm": 0.36170676350593567, - "learning_rate": 7.334545454545455e-06, - "loss": 0.0587, - "step": 734 - }, - { - "epoch": 13.363636363636363, - "grad_norm": 3.3307998180389404, - "learning_rate": 7.330909090909092e-06, - "loss": 0.0679, - "step": 735 - }, - { - "epoch": 13.381818181818183, - "grad_norm": 2.9988760948181152, - "learning_rate": 7.3272727272727285e-06, - "loss": 0.0578, - "step": 736 - }, - { - "epoch": 13.4, - "grad_norm": 2.31416916847229, - "learning_rate": 7.3236363636363635e-06, - "loss": 0.0857, - "step": 737 - }, - { - "epoch": 13.418181818181818, - "grad_norm": 0.20411837100982666, - "learning_rate": 7.32e-06, - "loss": 0.0494, - "step": 738 - }, - { - "epoch": 13.436363636363636, - "grad_norm": 3.9992427825927734, - "learning_rate": 7.316363636363637e-06, - "loss": 0.0674, - "step": 739 - }, - { - "epoch": 13.454545454545455, - "grad_norm": 4.9325032234191895, - "learning_rate": 7.312727272727274e-06, - "loss": 0.0607, - "step": 740 - }, - { - "epoch": 13.472727272727273, - "grad_norm": 4.038264274597168, - "learning_rate": 7.30909090909091e-06, - "loss": 0.0541, - "step": 741 - }, - { - "epoch": 13.49090909090909, - "grad_norm": 1.163496494293213, - "learning_rate": 7.305454545454545e-06, - "loss": 0.0667, - "step": 742 - }, - { - "epoch": 13.50909090909091, - "grad_norm": 1.7079969644546509, - "learning_rate": 7.301818181818182e-06, - "loss": 0.0693, - "step": 743 - }, - { - "epoch": 13.527272727272727, - "grad_norm": 1.4930275678634644, - "learning_rate": 7.298181818181819e-06, - "loss": 0.0627, - "step": 744 - }, - { - "epoch": 13.545454545454545, - "grad_norm": 2.212484836578369, - "learning_rate": 7.2945454545454555e-06, - "loss": 0.0551, - "step": 745 - }, - { - "epoch": 13.563636363636364, - "grad_norm": 3.7220752239227295, - "learning_rate": 7.290909090909092e-06, - "loss": 0.0651, - "step": 746 - }, - { - "epoch": 13.581818181818182, - "grad_norm": 1.4656076431274414, - "learning_rate": 7.287272727272728e-06, - "loss": 0.0721, - "step": 747 - }, - { - "epoch": 13.6, - "grad_norm": 2.8756191730499268, - "learning_rate": 7.283636363636364e-06, - "loss": 0.0639, - "step": 748 - }, - { - "epoch": 13.618181818181817, - "grad_norm": 4.470875263214111, - "learning_rate": 7.280000000000001e-06, - "loss": 0.0833, - "step": 749 - }, - { - "epoch": 13.636363636363637, - "grad_norm": 2.450894355773926, - "learning_rate": 7.276363636363637e-06, - "loss": 0.0601, - "step": 750 - }, - { - "epoch": 13.654545454545454, - "grad_norm": 1.7537519931793213, - "learning_rate": 7.272727272727273e-06, - "loss": 0.0524, - "step": 751 - }, - { - "epoch": 13.672727272727272, - "grad_norm": 0.9635225534439087, - "learning_rate": 7.26909090909091e-06, - "loss": 0.0577, - "step": 752 - }, - { - "epoch": 13.690909090909091, - "grad_norm": 0.2555043399333954, - "learning_rate": 7.265454545454546e-06, - "loss": 0.0767, - "step": 753 - }, - { - "epoch": 13.709090909090909, - "grad_norm": 0.7031099200248718, - "learning_rate": 7.2618181818181824e-06, - "loss": 0.0448, - "step": 754 - }, - { - "epoch": 13.727272727272727, - "grad_norm": 0.528627336025238, - "learning_rate": 7.258181818181818e-06, - "loss": 0.0553, - "step": 755 - }, - { - "epoch": 13.745454545454546, - "grad_norm": 1.3477230072021484, - "learning_rate": 7.254545454545455e-06, - "loss": 0.0516, - "step": 756 - }, - { - "epoch": 13.763636363636364, - "grad_norm": 0.6291450262069702, - "learning_rate": 7.250909090909092e-06, - "loss": 0.0663, - "step": 757 - }, - { - "epoch": 13.781818181818181, - "grad_norm": 2.4294962882995605, - "learning_rate": 7.247272727272728e-06, - "loss": 0.06, - "step": 758 - }, - { - "epoch": 13.8, - "grad_norm": 3.0542869567871094, - "learning_rate": 7.2436363636363634e-06, - "loss": 0.0709, - "step": 759 - }, - { - "epoch": 13.818181818181818, - "grad_norm": 0.8724761009216309, - "learning_rate": 7.24e-06, - "loss": 0.0643, - "step": 760 - }, - { - "epoch": 13.836363636363636, - "grad_norm": 1.070035696029663, - "learning_rate": 7.236363636363637e-06, - "loss": 0.0496, - "step": 761 - }, - { - "epoch": 13.854545454545455, - "grad_norm": 1.2298158407211304, - "learning_rate": 7.2327272727272736e-06, - "loss": 0.0433, - "step": 762 - }, - { - "epoch": 13.872727272727273, - "grad_norm": 1.729230284690857, - "learning_rate": 7.22909090909091e-06, - "loss": 0.063, - "step": 763 - }, - { - "epoch": 13.89090909090909, - "grad_norm": 2.3149638175964355, - "learning_rate": 7.225454545454545e-06, - "loss": 0.0616, - "step": 764 - }, - { - "epoch": 13.909090909090908, - "grad_norm": 1.8914380073547363, - "learning_rate": 7.221818181818182e-06, - "loss": 0.065, - "step": 765 - }, - { - "epoch": 13.927272727272728, - "grad_norm": 0.33422544598579407, - "learning_rate": 7.218181818181819e-06, - "loss": 0.0696, - "step": 766 - }, - { - "epoch": 13.945454545454545, - "grad_norm": 2.497201919555664, - "learning_rate": 7.214545454545455e-06, - "loss": 0.0624, - "step": 767 - }, - { - "epoch": 13.963636363636363, - "grad_norm": 3.617107629776001, - "learning_rate": 7.210909090909092e-06, - "loss": 0.075, - "step": 768 - }, - { - "epoch": 13.981818181818182, - "grad_norm": 0.3448617160320282, - "learning_rate": 7.207272727272728e-06, - "loss": 0.0644, - "step": 769 - }, - { - "epoch": 14.0, - "grad_norm": 3.0629079341888428, - "learning_rate": 7.203636363636364e-06, - "loss": 0.0561, - "step": 770 - }, - { - "epoch": 14.0, - "eval_loss": 0.06116378307342529, - "eval_runtime": 8.9743, - "eval_samples_per_second": 606.735, - "eval_steps_per_second": 75.884, - "step": 770 - }, - { - "epoch": 14.018181818181818, - "grad_norm": 1.9670838117599487, - "learning_rate": 7.2000000000000005e-06, - "loss": 0.0655, - "step": 771 - }, - { - "epoch": 14.036363636363637, - "grad_norm": 1.5837658643722534, - "learning_rate": 7.196363636363637e-06, - "loss": 0.0618, - "step": 772 - }, - { - "epoch": 14.054545454545455, - "grad_norm": 3.973050832748413, - "learning_rate": 7.192727272727273e-06, - "loss": 0.0836, - "step": 773 - }, - { - "epoch": 14.072727272727272, - "grad_norm": 4.079521656036377, - "learning_rate": 7.18909090909091e-06, - "loss": 0.0748, - "step": 774 - }, - { - "epoch": 14.090909090909092, - "grad_norm": 4.901785373687744, - "learning_rate": 7.1854545454545465e-06, - "loss": 0.0756, - "step": 775 - }, - { - "epoch": 14.10909090909091, - "grad_norm": 0.4171094298362732, - "learning_rate": 7.181818181818182e-06, - "loss": 0.0843, - "step": 776 - }, - { - "epoch": 14.127272727272727, - "grad_norm": 2.980266571044922, - "learning_rate": 7.178181818181818e-06, - "loss": 0.05, - "step": 777 - }, - { - "epoch": 14.145454545454545, - "grad_norm": 3.1900100708007812, - "learning_rate": 7.174545454545455e-06, - "loss": 0.073, - "step": 778 - }, - { - "epoch": 14.163636363636364, - "grad_norm": 1.6871399879455566, - "learning_rate": 7.170909090909092e-06, - "loss": 0.0529, - "step": 779 - }, - { - "epoch": 14.181818181818182, - "grad_norm": 2.1130380630493164, - "learning_rate": 7.167272727272728e-06, - "loss": 0.0661, - "step": 780 - }, - { - "epoch": 14.2, - "grad_norm": 2.1416566371917725, - "learning_rate": 7.163636363636363e-06, - "loss": 0.0498, - "step": 781 - }, - { - "epoch": 14.218181818181819, - "grad_norm": 1.0796903371810913, - "learning_rate": 7.16e-06, - "loss": 0.0708, - "step": 782 - }, - { - "epoch": 14.236363636363636, - "grad_norm": 2.4314792156219482, - "learning_rate": 7.156363636363637e-06, - "loss": 0.0731, - "step": 783 - }, - { - "epoch": 14.254545454545454, - "grad_norm": 4.1142497062683105, - "learning_rate": 7.1527272727272735e-06, - "loss": 0.0668, - "step": 784 - }, - { - "epoch": 14.272727272727273, - "grad_norm": 1.5056047439575195, - "learning_rate": 7.14909090909091e-06, - "loss": 0.0702, - "step": 785 - }, - { - "epoch": 14.290909090909091, - "grad_norm": 2.4282941818237305, - "learning_rate": 7.145454545454547e-06, - "loss": 0.0608, - "step": 786 - }, - { - "epoch": 14.309090909090909, - "grad_norm": 3.727909564971924, - "learning_rate": 7.141818181818182e-06, - "loss": 0.073, - "step": 787 - }, - { - "epoch": 14.327272727272728, - "grad_norm": 0.965523898601532, - "learning_rate": 7.138181818181819e-06, - "loss": 0.0627, - "step": 788 - }, - { - "epoch": 14.345454545454546, - "grad_norm": 1.8487173318862915, - "learning_rate": 7.134545454545455e-06, - "loss": 0.07, - "step": 789 - }, - { - "epoch": 14.363636363636363, - "grad_norm": 4.959105491638184, - "learning_rate": 7.130909090909092e-06, - "loss": 0.0603, - "step": 790 - }, - { - "epoch": 14.381818181818183, - "grad_norm": 2.690051555633545, - "learning_rate": 7.127272727272728e-06, - "loss": 0.0609, - "step": 791 - }, - { - "epoch": 14.4, - "grad_norm": 1.641316294670105, - "learning_rate": 7.123636363636364e-06, - "loss": 0.0728, - "step": 792 - }, - { - "epoch": 14.418181818181818, - "grad_norm": 1.637271523475647, - "learning_rate": 7.1200000000000004e-06, - "loss": 0.0523, - "step": 793 - }, - { - "epoch": 14.436363636363636, - "grad_norm": 0.6300403475761414, - "learning_rate": 7.116363636363637e-06, - "loss": 0.0648, - "step": 794 - }, - { - "epoch": 14.454545454545455, - "grad_norm": 1.1752089262008667, - "learning_rate": 7.112727272727273e-06, - "loss": 0.061, - "step": 795 - }, - { - "epoch": 14.472727272727273, - "grad_norm": 1.7865952253341675, - "learning_rate": 7.10909090909091e-06, - "loss": 0.0488, - "step": 796 - }, - { - "epoch": 14.49090909090909, - "grad_norm": 2.0968446731567383, - "learning_rate": 7.105454545454546e-06, - "loss": 0.0637, - "step": 797 - }, - { - "epoch": 14.50909090909091, - "grad_norm": 1.6635384559631348, - "learning_rate": 7.101818181818182e-06, - "loss": 0.0416, - "step": 798 - }, - { - "epoch": 14.527272727272727, - "grad_norm": 0.4678679406642914, - "learning_rate": 7.098181818181818e-06, - "loss": 0.0547, - "step": 799 - }, - { - "epoch": 14.545454545454545, - "grad_norm": 1.2559705972671509, - "learning_rate": 7.094545454545455e-06, - "loss": 0.0784, - "step": 800 - }, - { - "epoch": 14.563636363636364, - "grad_norm": 1.2719247341156006, - "learning_rate": 7.0909090909090916e-06, - "loss": 0.0482, - "step": 801 - }, - { - "epoch": 14.581818181818182, - "grad_norm": 2.6837501525878906, - "learning_rate": 7.087272727272728e-06, - "loss": 0.0714, - "step": 802 - }, - { - "epoch": 14.6, - "grad_norm": 3.468778610229492, - "learning_rate": 7.083636363636365e-06, - "loss": 0.0603, - "step": 803 - }, - { - "epoch": 14.618181818181817, - "grad_norm": 1.1968810558319092, - "learning_rate": 7.08e-06, - "loss": 0.0742, - "step": 804 - }, - { - "epoch": 14.636363636363637, - "grad_norm": 1.5427498817443848, - "learning_rate": 7.076363636363637e-06, - "loss": 0.0571, - "step": 805 - }, - { - "epoch": 14.654545454545454, - "grad_norm": 2.570462942123413, - "learning_rate": 7.072727272727273e-06, - "loss": 0.0548, - "step": 806 - }, - { - "epoch": 14.672727272727272, - "grad_norm": 2.577770233154297, - "learning_rate": 7.06909090909091e-06, - "loss": 0.0511, - "step": 807 - }, - { - "epoch": 14.690909090909091, - "grad_norm": 0.5914002060890198, - "learning_rate": 7.065454545454547e-06, - "loss": 0.0443, - "step": 808 - }, - { - "epoch": 14.709090909090909, - "grad_norm": 2.572910785675049, - "learning_rate": 7.061818181818182e-06, - "loss": 0.0593, - "step": 809 - }, - { - "epoch": 14.727272727272727, - "grad_norm": 2.141205310821533, - "learning_rate": 7.0581818181818185e-06, - "loss": 0.0607, - "step": 810 - }, - { - "epoch": 14.745454545454546, - "grad_norm": 1.517813801765442, - "learning_rate": 7.054545454545455e-06, - "loss": 0.066, - "step": 811 - }, - { - "epoch": 14.763636363636364, - "grad_norm": 4.054256439208984, - "learning_rate": 7.050909090909092e-06, - "loss": 0.0736, - "step": 812 - }, - { - "epoch": 14.781818181818181, - "grad_norm": 6.789210319519043, - "learning_rate": 7.047272727272728e-06, - "loss": 0.0665, - "step": 813 - }, - { - "epoch": 14.8, - "grad_norm": 6.695540904998779, - "learning_rate": 7.0436363636363645e-06, - "loss": 0.0855, - "step": 814 - }, - { - "epoch": 14.818181818181818, - "grad_norm": 4.404475212097168, - "learning_rate": 7.04e-06, - "loss": 0.0647, - "step": 815 - }, - { - "epoch": 14.836363636363636, - "grad_norm": 1.5647752285003662, - "learning_rate": 7.036363636363637e-06, - "loss": 0.0608, - "step": 816 - }, - { - "epoch": 14.854545454545455, - "grad_norm": 2.161693811416626, - "learning_rate": 7.032727272727273e-06, - "loss": 0.0537, - "step": 817 - }, - { - "epoch": 14.872727272727273, - "grad_norm": 2.4333837032318115, - "learning_rate": 7.02909090909091e-06, - "loss": 0.0653, - "step": 818 - }, - { - "epoch": 14.89090909090909, - "grad_norm": 0.3157075345516205, - "learning_rate": 7.025454545454546e-06, - "loss": 0.0843, - "step": 819 - }, - { - "epoch": 14.909090909090908, - "grad_norm": 1.7722207307815552, - "learning_rate": 7.021818181818182e-06, - "loss": 0.0599, - "step": 820 - }, - { - "epoch": 14.927272727272728, - "grad_norm": 1.9363938570022583, - "learning_rate": 7.018181818181818e-06, - "loss": 0.049, - "step": 821 - }, - { - "epoch": 14.945454545454545, - "grad_norm": 1.2045873403549194, - "learning_rate": 7.014545454545455e-06, - "loss": 0.0586, - "step": 822 - }, - { - "epoch": 14.963636363636363, - "grad_norm": 0.5680413246154785, - "learning_rate": 7.0109090909090915e-06, - "loss": 0.0602, - "step": 823 - }, - { - "epoch": 14.981818181818182, - "grad_norm": 0.9456818103790283, - "learning_rate": 7.007272727272728e-06, - "loss": 0.0607, - "step": 824 - }, - { - "epoch": 15.0, - "grad_norm": 0.929862380027771, - "learning_rate": 7.003636363636365e-06, - "loss": 0.0572, - "step": 825 - }, - { - "epoch": 15.0, - "eval_loss": 0.06318649649620056, - "eval_runtime": 8.8288, - "eval_samples_per_second": 616.732, - "eval_steps_per_second": 77.134, - "step": 825 - }, - { - "epoch": 15.018181818181818, - "grad_norm": 3.6373069286346436, - "learning_rate": 7e-06, - "loss": 0.0711, - "step": 826 - }, - { - "epoch": 15.036363636363637, - "grad_norm": 4.059961795806885, - "learning_rate": 6.996363636363637e-06, - "loss": 0.0686, - "step": 827 - }, - { - "epoch": 15.054545454545455, - "grad_norm": 2.625581741333008, - "learning_rate": 6.992727272727273e-06, - "loss": 0.0611, - "step": 828 - }, - { - "epoch": 15.072727272727272, - "grad_norm": 1.742775559425354, - "learning_rate": 6.98909090909091e-06, - "loss": 0.0597, - "step": 829 - }, - { - "epoch": 15.090909090909092, - "grad_norm": 2.393937826156616, - "learning_rate": 6.985454545454547e-06, - "loss": 0.0633, - "step": 830 - }, - { - "epoch": 15.10909090909091, - "grad_norm": 2.3471293449401855, - "learning_rate": 6.981818181818183e-06, - "loss": 0.057, - "step": 831 - }, - { - "epoch": 15.127272727272727, - "grad_norm": 0.7524638772010803, - "learning_rate": 6.9781818181818184e-06, - "loss": 0.0668, - "step": 832 - }, - { - "epoch": 15.145454545454545, - "grad_norm": 1.1395450830459595, - "learning_rate": 6.974545454545455e-06, - "loss": 0.051, - "step": 833 - }, - { - "epoch": 15.163636363636364, - "grad_norm": 0.20304043591022491, - "learning_rate": 6.970909090909092e-06, - "loss": 0.0523, - "step": 834 - }, - { - "epoch": 15.181818181818182, - "grad_norm": 1.0048550367355347, - "learning_rate": 6.967272727272728e-06, - "loss": 0.0596, - "step": 835 - }, - { - "epoch": 15.2, - "grad_norm": 2.196291208267212, - "learning_rate": 6.963636363636364e-06, - "loss": 0.0834, - "step": 836 - }, - { - "epoch": 15.218181818181819, - "grad_norm": 0.9714677929878235, - "learning_rate": 6.96e-06, - "loss": 0.044, - "step": 837 - }, - { - "epoch": 15.236363636363636, - "grad_norm": 0.738624632358551, - "learning_rate": 6.956363636363637e-06, - "loss": 0.0667, - "step": 838 - }, - { - "epoch": 15.254545454545454, - "grad_norm": 1.9332901239395142, - "learning_rate": 6.952727272727273e-06, - "loss": 0.0563, - "step": 839 - }, - { - "epoch": 15.272727272727273, - "grad_norm": 1.8341258764266968, - "learning_rate": 6.9490909090909096e-06, - "loss": 0.0599, - "step": 840 - }, - { - "epoch": 15.290909090909091, - "grad_norm": 1.4763063192367554, - "learning_rate": 6.945454545454546e-06, - "loss": 0.0671, - "step": 841 - }, - { - "epoch": 15.309090909090909, - "grad_norm": 1.6250065565109253, - "learning_rate": 6.941818181818183e-06, - "loss": 0.0875, - "step": 842 - }, - { - "epoch": 15.327272727272728, - "grad_norm": 5.05354642868042, - "learning_rate": 6.938181818181818e-06, - "loss": 0.0733, - "step": 843 - }, - { - "epoch": 15.345454545454546, - "grad_norm": 4.39846658706665, - "learning_rate": 6.934545454545455e-06, - "loss": 0.056, - "step": 844 - }, - { - "epoch": 15.363636363636363, - "grad_norm": 1.0581274032592773, - "learning_rate": 6.930909090909091e-06, - "loss": 0.0688, - "step": 845 - }, - { - "epoch": 15.381818181818183, - "grad_norm": 3.610654354095459, - "learning_rate": 6.927272727272728e-06, - "loss": 0.069, - "step": 846 - }, - { - "epoch": 15.4, - "grad_norm": 4.264715194702148, - "learning_rate": 6.923636363636365e-06, - "loss": 0.0607, - "step": 847 - }, - { - "epoch": 15.418181818181818, - "grad_norm": 4.675809860229492, - "learning_rate": 6.92e-06, - "loss": 0.0746, - "step": 848 - }, - { - "epoch": 15.436363636363636, - "grad_norm": 2.7993361949920654, - "learning_rate": 6.9163636363636365e-06, - "loss": 0.065, - "step": 849 - }, - { - "epoch": 15.454545454545455, - "grad_norm": 1.712072491645813, - "learning_rate": 6.912727272727273e-06, - "loss": 0.0616, - "step": 850 - }, - { - "epoch": 15.472727272727273, - "grad_norm": 2.8242838382720947, - "learning_rate": 6.90909090909091e-06, - "loss": 0.0606, - "step": 851 - }, - { - "epoch": 15.49090909090909, - "grad_norm": 3.865076780319214, - "learning_rate": 6.905454545454547e-06, - "loss": 0.0748, - "step": 852 - }, - { - "epoch": 15.50909090909091, - "grad_norm": 0.25274738669395447, - "learning_rate": 6.9018181818181825e-06, - "loss": 0.0694, - "step": 853 - }, - { - "epoch": 15.527272727272727, - "grad_norm": 1.2096081972122192, - "learning_rate": 6.898181818181818e-06, - "loss": 0.0476, - "step": 854 - }, - { - "epoch": 15.545454545454545, - "grad_norm": 1.9555130004882812, - "learning_rate": 6.894545454545455e-06, - "loss": 0.0608, - "step": 855 - }, - { - "epoch": 15.563636363636364, - "grad_norm": 1.071932315826416, - "learning_rate": 6.890909090909092e-06, - "loss": 0.0671, - "step": 856 - }, - { - "epoch": 15.581818181818182, - "grad_norm": 3.3838398456573486, - "learning_rate": 6.887272727272728e-06, - "loss": 0.0514, - "step": 857 - }, - { - "epoch": 15.6, - "grad_norm": 4.951547622680664, - "learning_rate": 6.883636363636364e-06, - "loss": 0.0647, - "step": 858 - }, - { - "epoch": 15.618181818181817, - "grad_norm": 2.8567936420440674, - "learning_rate": 6.88e-06, - "loss": 0.0561, - "step": 859 - }, - { - "epoch": 15.636363636363637, - "grad_norm": 1.210578441619873, - "learning_rate": 6.876363636363637e-06, - "loss": 0.0626, - "step": 860 - }, - { - "epoch": 15.654545454545454, - "grad_norm": 4.644229888916016, - "learning_rate": 6.872727272727273e-06, - "loss": 0.0739, - "step": 861 - }, - { - "epoch": 15.672727272727272, - "grad_norm": 6.756345748901367, - "learning_rate": 6.8690909090909095e-06, - "loss": 0.0617, - "step": 862 - }, - { - "epoch": 15.690909090909091, - "grad_norm": 6.277345180511475, - "learning_rate": 6.865454545454546e-06, - "loss": 0.0571, - "step": 863 - }, - { - "epoch": 15.709090909090909, - "grad_norm": 4.759640693664551, - "learning_rate": 6.861818181818183e-06, - "loss": 0.0782, - "step": 864 - }, - { - "epoch": 15.727272727272727, - "grad_norm": 1.7211660146713257, - "learning_rate": 6.858181818181818e-06, - "loss": 0.0833, - "step": 865 - }, - { - "epoch": 15.745454545454546, - "grad_norm": 4.693413257598877, - "learning_rate": 6.854545454545455e-06, - "loss": 0.0668, - "step": 866 - }, - { - "epoch": 15.763636363636364, - "grad_norm": 8.411813735961914, - "learning_rate": 6.850909090909091e-06, - "loss": 0.0753, - "step": 867 - }, - { - "epoch": 15.781818181818181, - "grad_norm": 8.891500473022461, - "learning_rate": 6.847272727272728e-06, - "loss": 0.079, - "step": 868 - }, - { - "epoch": 15.8, - "grad_norm": 6.808396339416504, - "learning_rate": 6.843636363636365e-06, - "loss": 0.0741, - "step": 869 - }, - { - "epoch": 15.818181818181818, - "grad_norm": 3.882725715637207, - "learning_rate": 6.8400000000000014e-06, - "loss": 0.0638, - "step": 870 - }, - { - "epoch": 15.836363636363636, - "grad_norm": 2.0678040981292725, - "learning_rate": 6.8363636363636364e-06, - "loss": 0.058, - "step": 871 - }, - { - "epoch": 15.854545454545455, - "grad_norm": 4.197636604309082, - "learning_rate": 6.832727272727273e-06, - "loss": 0.068, - "step": 872 - }, - { - "epoch": 15.872727272727273, - "grad_norm": 2.265993595123291, - "learning_rate": 6.82909090909091e-06, - "loss": 0.0455, - "step": 873 - }, - { - "epoch": 15.89090909090909, - "grad_norm": 2.2964165210723877, - "learning_rate": 6.8254545454545466e-06, - "loss": 0.0667, - "step": 874 - }, - { - "epoch": 15.909090909090908, - "grad_norm": 2.934535264968872, - "learning_rate": 6.821818181818182e-06, - "loss": 0.0555, - "step": 875 - }, - { - "epoch": 15.927272727272728, - "grad_norm": 0.6989548206329346, - "learning_rate": 6.818181818181818e-06, - "loss": 0.0674, - "step": 876 - }, - { - "epoch": 15.945454545454545, - "grad_norm": 2.69486403465271, - "learning_rate": 6.814545454545455e-06, - "loss": 0.0605, - "step": 877 - }, - { - "epoch": 15.963636363636363, - "grad_norm": 1.1402214765548706, - "learning_rate": 6.810909090909092e-06, - "loss": 0.0821, - "step": 878 - }, - { - "epoch": 15.981818181818182, - "grad_norm": 3.696303129196167, - "learning_rate": 6.8072727272727275e-06, - "loss": 0.0639, - "step": 879 - }, - { - "epoch": 16.0, - "grad_norm": 6.102230072021484, - "learning_rate": 6.803636363636364e-06, - "loss": 0.0607, - "step": 880 - }, - { - "epoch": 16.0, - "eval_loss": 0.06663096696138382, - "eval_runtime": 9.402, - "eval_samples_per_second": 579.132, - "eval_steps_per_second": 72.431, - "step": 880 - }, - { - "epoch": 16.01818181818182, - "grad_norm": 5.73374080657959, - "learning_rate": 6.800000000000001e-06, - "loss": 0.0758, - "step": 881 - }, - { - "epoch": 16.036363636363635, - "grad_norm": 1.5274279117584229, - "learning_rate": 6.796363636363637e-06, - "loss": 0.0555, - "step": 882 - }, - { - "epoch": 16.054545454545455, - "grad_norm": 2.6027486324310303, - "learning_rate": 6.792727272727273e-06, - "loss": 0.0709, - "step": 883 - }, - { - "epoch": 16.072727272727274, - "grad_norm": 3.30393385887146, - "learning_rate": 6.789090909090909e-06, - "loss": 0.0668, - "step": 884 - }, - { - "epoch": 16.09090909090909, - "grad_norm": 2.3329808712005615, - "learning_rate": 6.785454545454546e-06, - "loss": 0.0735, - "step": 885 - }, - { - "epoch": 16.10909090909091, - "grad_norm": 1.1085752248764038, - "learning_rate": 6.781818181818183e-06, - "loss": 0.0765, - "step": 886 - }, - { - "epoch": 16.12727272727273, - "grad_norm": 2.6080358028411865, - "learning_rate": 6.778181818181818e-06, - "loss": 0.055, - "step": 887 - }, - { - "epoch": 16.145454545454545, - "grad_norm": 3.8366918563842773, - "learning_rate": 6.7745454545454545e-06, - "loss": 0.0577, - "step": 888 - }, - { - "epoch": 16.163636363636364, - "grad_norm": 3.671412467956543, - "learning_rate": 6.770909090909091e-06, - "loss": 0.0703, - "step": 889 - }, - { - "epoch": 16.181818181818183, - "grad_norm": 2.239098072052002, - "learning_rate": 6.767272727272728e-06, - "loss": 0.0698, - "step": 890 - }, - { - "epoch": 16.2, - "grad_norm": 1.8378283977508545, - "learning_rate": 6.763636363636365e-06, - "loss": 0.0691, - "step": 891 - }, - { - "epoch": 16.21818181818182, - "grad_norm": 4.188761234283447, - "learning_rate": 6.760000000000001e-06, - "loss": 0.0647, - "step": 892 - }, - { - "epoch": 16.236363636363638, - "grad_norm": 4.674147605895996, - "learning_rate": 6.756363636363636e-06, - "loss": 0.0663, - "step": 893 - }, - { - "epoch": 16.254545454545454, - "grad_norm": 3.115976572036743, - "learning_rate": 6.752727272727273e-06, - "loss": 0.0581, - "step": 894 - }, - { - "epoch": 16.272727272727273, - "grad_norm": 2.232649564743042, - "learning_rate": 6.74909090909091e-06, - "loss": 0.0579, - "step": 895 - }, - { - "epoch": 16.29090909090909, - "grad_norm": 2.7771944999694824, - "learning_rate": 6.7454545454545465e-06, - "loss": 0.06, - "step": 896 - }, - { - "epoch": 16.30909090909091, - "grad_norm": 2.712419271469116, - "learning_rate": 6.741818181818182e-06, - "loss": 0.0677, - "step": 897 - }, - { - "epoch": 16.327272727272728, - "grad_norm": 0.3379657566547394, - "learning_rate": 6.738181818181819e-06, - "loss": 0.0583, - "step": 898 - }, - { - "epoch": 16.345454545454544, - "grad_norm": 3.1065523624420166, - "learning_rate": 6.734545454545455e-06, - "loss": 0.0577, - "step": 899 - }, - { - "epoch": 16.363636363636363, - "grad_norm": 2.3098392486572266, - "learning_rate": 6.730909090909092e-06, - "loss": 0.065, - "step": 900 - }, - { - "epoch": 16.381818181818183, - "grad_norm": 1.4125452041625977, - "learning_rate": 6.7272727272727275e-06, - "loss": 0.0597, - "step": 901 - }, - { - "epoch": 16.4, - "grad_norm": 2.744288921356201, - "learning_rate": 6.723636363636364e-06, - "loss": 0.0625, - "step": 902 - }, - { - "epoch": 16.418181818181818, - "grad_norm": 2.9497387409210205, - "learning_rate": 6.720000000000001e-06, - "loss": 0.0572, - "step": 903 - }, - { - "epoch": 16.436363636363637, - "grad_norm": 2.866325855255127, - "learning_rate": 6.716363636363637e-06, - "loss": 0.0745, - "step": 904 - }, - { - "epoch": 16.454545454545453, - "grad_norm": 0.41356438398361206, - "learning_rate": 6.712727272727273e-06, - "loss": 0.0706, - "step": 905 - }, - { - "epoch": 16.472727272727273, - "grad_norm": 1.335402011871338, - "learning_rate": 6.709090909090909e-06, - "loss": 0.0724, - "step": 906 - }, - { - "epoch": 16.490909090909092, - "grad_norm": 1.8239550590515137, - "learning_rate": 6.705454545454546e-06, - "loss": 0.0554, - "step": 907 - }, - { - "epoch": 16.509090909090908, - "grad_norm": 0.5610803961753845, - "learning_rate": 6.701818181818183e-06, - "loss": 0.059, - "step": 908 - }, - { - "epoch": 16.527272727272727, - "grad_norm": 0.8413911461830139, - "learning_rate": 6.6981818181818194e-06, - "loss": 0.0533, - "step": 909 - }, - { - "epoch": 16.545454545454547, - "grad_norm": 1.0959774255752563, - "learning_rate": 6.6945454545454544e-06, - "loss": 0.0575, - "step": 910 - }, - { - "epoch": 16.563636363636363, - "grad_norm": 0.2551763355731964, - "learning_rate": 6.690909090909091e-06, - "loss": 0.052, - "step": 911 - }, - { - "epoch": 16.581818181818182, - "grad_norm": 2.0218007564544678, - "learning_rate": 6.687272727272728e-06, - "loss": 0.0653, - "step": 912 - }, - { - "epoch": 16.6, - "grad_norm": 0.8809184432029724, - "learning_rate": 6.6836363636363646e-06, - "loss": 0.0664, - "step": 913 - }, - { - "epoch": 16.618181818181817, - "grad_norm": 1.6229790449142456, - "learning_rate": 6.680000000000001e-06, - "loss": 0.0722, - "step": 914 - }, - { - "epoch": 16.636363636363637, - "grad_norm": 3.8639652729034424, - "learning_rate": 6.676363636363636e-06, - "loss": 0.0775, - "step": 915 - }, - { - "epoch": 16.654545454545456, - "grad_norm": 1.4158262014389038, - "learning_rate": 6.672727272727273e-06, - "loss": 0.0556, - "step": 916 - }, - { - "epoch": 16.672727272727272, - "grad_norm": 1.4040050506591797, - "learning_rate": 6.66909090909091e-06, - "loss": 0.0559, - "step": 917 - }, - { - "epoch": 16.69090909090909, - "grad_norm": 3.0734200477600098, - "learning_rate": 6.665454545454546e-06, - "loss": 0.0708, - "step": 918 - }, - { - "epoch": 16.70909090909091, - "grad_norm": 2.3759617805480957, - "learning_rate": 6.661818181818182e-06, - "loss": 0.0597, - "step": 919 - }, - { - "epoch": 16.727272727272727, - "grad_norm": 1.829919457435608, - "learning_rate": 6.658181818181819e-06, - "loss": 0.0474, - "step": 920 - }, - { - "epoch": 16.745454545454546, - "grad_norm": 3.470067024230957, - "learning_rate": 6.654545454545455e-06, - "loss": 0.0452, - "step": 921 - }, - { - "epoch": 16.763636363636365, - "grad_norm": 0.22983339428901672, - "learning_rate": 6.6509090909090915e-06, - "loss": 0.066, - "step": 922 - }, - { - "epoch": 16.78181818181818, - "grad_norm": 0.26110702753067017, - "learning_rate": 6.647272727272727e-06, - "loss": 0.0528, - "step": 923 - }, - { - "epoch": 16.8, - "grad_norm": 0.9159600734710693, - "learning_rate": 6.643636363636364e-06, - "loss": 0.0509, - "step": 924 - }, - { - "epoch": 16.818181818181817, - "grad_norm": 0.2624635398387909, - "learning_rate": 6.640000000000001e-06, - "loss": 0.0566, - "step": 925 - }, - { - "epoch": 16.836363636363636, - "grad_norm": 1.6430072784423828, - "learning_rate": 6.6363636363636375e-06, - "loss": 0.065, - "step": 926 - }, - { - "epoch": 16.854545454545455, - "grad_norm": 2.9276740550994873, - "learning_rate": 6.6327272727272725e-06, - "loss": 0.0398, - "step": 927 - }, - { - "epoch": 16.87272727272727, - "grad_norm": 1.1057707071304321, - "learning_rate": 6.629090909090909e-06, - "loss": 0.0711, - "step": 928 - }, - { - "epoch": 16.89090909090909, - "grad_norm": 0.45399510860443115, - "learning_rate": 6.625454545454546e-06, - "loss": 0.0651, - "step": 929 - }, - { - "epoch": 16.90909090909091, - "grad_norm": 1.3540892601013184, - "learning_rate": 6.621818181818183e-06, - "loss": 0.0708, - "step": 930 - }, - { - "epoch": 16.927272727272726, - "grad_norm": 0.8137788772583008, - "learning_rate": 6.618181818181819e-06, - "loss": 0.0712, - "step": 931 - }, - { - "epoch": 16.945454545454545, - "grad_norm": 0.2920357882976532, - "learning_rate": 6.614545454545454e-06, - "loss": 0.0705, - "step": 932 - }, - { - "epoch": 16.963636363636365, - "grad_norm": 0.30102020502090454, - "learning_rate": 6.610909090909091e-06, - "loss": 0.0535, - "step": 933 - }, - { - "epoch": 16.98181818181818, - "grad_norm": 0.34743496775627136, - "learning_rate": 6.607272727272728e-06, - "loss": 0.0504, - "step": 934 - }, - { - "epoch": 17.0, - "grad_norm": 0.22518983483314514, - "learning_rate": 6.6036363636363645e-06, - "loss": 0.0625, - "step": 935 - }, - { - "epoch": 17.0, - "eval_loss": 0.05811486765742302, - "eval_runtime": 8.9429, - "eval_samples_per_second": 608.864, - "eval_steps_per_second": 76.15, - "step": 935 - }, - { - "epoch": 17.01818181818182, - "grad_norm": 0.771604597568512, - "learning_rate": 6.600000000000001e-06, - "loss": 0.0588, - "step": 936 - }, - { - "epoch": 17.036363636363635, - "grad_norm": 0.4630626440048218, - "learning_rate": 6.596363636363637e-06, - "loss": 0.0503, - "step": 937 - }, - { - "epoch": 17.054545454545455, - "grad_norm": 2.7097997665405273, - "learning_rate": 6.592727272727273e-06, - "loss": 0.0688, - "step": 938 - }, - { - "epoch": 17.072727272727274, - "grad_norm": 1.3959273099899292, - "learning_rate": 6.58909090909091e-06, - "loss": 0.0714, - "step": 939 - }, - { - "epoch": 17.09090909090909, - "grad_norm": 1.1690318584442139, - "learning_rate": 6.585454545454546e-06, - "loss": 0.064, - "step": 940 - }, - { - "epoch": 17.10909090909091, - "grad_norm": 3.2116758823394775, - "learning_rate": 6.581818181818182e-06, - "loss": 0.0615, - "step": 941 - }, - { - "epoch": 17.12727272727273, - "grad_norm": 2.41903018951416, - "learning_rate": 6.578181818181819e-06, - "loss": 0.0594, - "step": 942 - }, - { - "epoch": 17.145454545454545, - "grad_norm": 2.730008125305176, - "learning_rate": 6.574545454545455e-06, - "loss": 0.0748, - "step": 943 - }, - { - "epoch": 17.163636363636364, - "grad_norm": 2.371711254119873, - "learning_rate": 6.5709090909090914e-06, - "loss": 0.0506, - "step": 944 - }, - { - "epoch": 17.181818181818183, - "grad_norm": 1.1392409801483154, - "learning_rate": 6.567272727272727e-06, - "loss": 0.0578, - "step": 945 - }, - { - "epoch": 17.2, - "grad_norm": 2.5779550075531006, - "learning_rate": 6.563636363636364e-06, - "loss": 0.0603, - "step": 946 - }, - { - "epoch": 17.21818181818182, - "grad_norm": 1.5214225053787231, - "learning_rate": 6.560000000000001e-06, - "loss": 0.0772, - "step": 947 - }, - { - "epoch": 17.236363636363638, - "grad_norm": 1.3445292711257935, - "learning_rate": 6.5563636363636374e-06, - "loss": 0.0541, - "step": 948 - }, - { - "epoch": 17.254545454545454, - "grad_norm": 1.7062091827392578, - "learning_rate": 6.5527272727272724e-06, - "loss": 0.0434, - "step": 949 - }, - { - "epoch": 17.272727272727273, - "grad_norm": 1.761246681213379, - "learning_rate": 6.549090909090909e-06, - "loss": 0.0685, - "step": 950 - }, - { - "epoch": 17.29090909090909, - "grad_norm": 1.5952452421188354, - "learning_rate": 6.545454545454546e-06, - "loss": 0.0561, - "step": 951 - }, - { - "epoch": 17.30909090909091, - "grad_norm": 0.6347488164901733, - "learning_rate": 6.5418181818181826e-06, - "loss": 0.0904, - "step": 952 - }, - { - "epoch": 17.327272727272728, - "grad_norm": 3.888216018676758, - "learning_rate": 6.538181818181819e-06, - "loss": 0.064, - "step": 953 - }, - { - "epoch": 17.345454545454544, - "grad_norm": 2.6645264625549316, - "learning_rate": 6.534545454545454e-06, - "loss": 0.0563, - "step": 954 - }, - { - "epoch": 17.363636363636363, - "grad_norm": 1.1593647003173828, - "learning_rate": 6.530909090909091e-06, - "loss": 0.0738, - "step": 955 - }, - { - "epoch": 17.381818181818183, - "grad_norm": 3.097285509109497, - "learning_rate": 6.527272727272728e-06, - "loss": 0.0726, - "step": 956 - }, - { - "epoch": 17.4, - "grad_norm": 3.4275588989257812, - "learning_rate": 6.523636363636364e-06, - "loss": 0.0676, - "step": 957 - }, - { - "epoch": 17.418181818181818, - "grad_norm": 4.157214641571045, - "learning_rate": 6.520000000000001e-06, - "loss": 0.0741, - "step": 958 - }, - { - "epoch": 17.436363636363637, - "grad_norm": 1.6189855337142944, - "learning_rate": 6.516363636363637e-06, - "loss": 0.055, - "step": 959 - }, - { - "epoch": 17.454545454545453, - "grad_norm": 2.81453275680542, - "learning_rate": 6.512727272727273e-06, - "loss": 0.0556, - "step": 960 - }, - { - "epoch": 17.472727272727273, - "grad_norm": 4.454223155975342, - "learning_rate": 6.5090909090909095e-06, - "loss": 0.0542, - "step": 961 - }, - { - "epoch": 17.490909090909092, - "grad_norm": 4.032587051391602, - "learning_rate": 6.505454545454546e-06, - "loss": 0.0713, - "step": 962 - }, - { - "epoch": 17.509090909090908, - "grad_norm": 2.2136881351470947, - "learning_rate": 6.501818181818182e-06, - "loss": 0.0583, - "step": 963 - }, - { - "epoch": 17.527272727272727, - "grad_norm": 0.9351394772529602, - "learning_rate": 6.498181818181819e-06, - "loss": 0.0443, - "step": 964 - }, - { - "epoch": 17.545454545454547, - "grad_norm": 3.596268892288208, - "learning_rate": 6.4945454545454555e-06, - "loss": 0.0603, - "step": 965 - }, - { - "epoch": 17.563636363636363, - "grad_norm": 4.032523155212402, - "learning_rate": 6.490909090909091e-06, - "loss": 0.0607, - "step": 966 - }, - { - "epoch": 17.581818181818182, - "grad_norm": 2.3243932723999023, - "learning_rate": 6.487272727272727e-06, - "loss": 0.0726, - "step": 967 - }, - { - "epoch": 17.6, - "grad_norm": 2.5019078254699707, - "learning_rate": 6.483636363636364e-06, - "loss": 0.063, - "step": 968 - }, - { - "epoch": 17.618181818181817, - "grad_norm": 3.305145025253296, - "learning_rate": 6.480000000000001e-06, - "loss": 0.0667, - "step": 969 - }, - { - "epoch": 17.636363636363637, - "grad_norm": 2.561406135559082, - "learning_rate": 6.476363636363637e-06, - "loss": 0.064, - "step": 970 - }, - { - "epoch": 17.654545454545456, - "grad_norm": 0.42631393671035767, - "learning_rate": 6.472727272727272e-06, - "loss": 0.0659, - "step": 971 - }, - { - "epoch": 17.672727272727272, - "grad_norm": 1.662918210029602, - "learning_rate": 6.469090909090909e-06, - "loss": 0.052, - "step": 972 - }, - { - "epoch": 17.69090909090909, - "grad_norm": 3.9338431358337402, - "learning_rate": 6.465454545454546e-06, - "loss": 0.0796, - "step": 973 - }, - { - "epoch": 17.70909090909091, - "grad_norm": 0.22917257249355316, - "learning_rate": 6.4618181818181825e-06, - "loss": 0.0716, - "step": 974 - }, - { - "epoch": 17.727272727272727, - "grad_norm": 0.3687666654586792, - "learning_rate": 6.458181818181819e-06, - "loss": 0.0627, - "step": 975 - }, - { - "epoch": 17.745454545454546, - "grad_norm": 1.4273935556411743, - "learning_rate": 6.454545454545456e-06, - "loss": 0.061, - "step": 976 - }, - { - "epoch": 17.763636363636365, - "grad_norm": 0.9296255111694336, - "learning_rate": 6.450909090909091e-06, - "loss": 0.0608, - "step": 977 - }, - { - "epoch": 17.78181818181818, - "grad_norm": 0.7088639140129089, - "learning_rate": 6.447272727272728e-06, - "loss": 0.0534, - "step": 978 - }, - { - "epoch": 17.8, - "grad_norm": 2.8425981998443604, - "learning_rate": 6.443636363636364e-06, - "loss": 0.0602, - "step": 979 - }, - { - "epoch": 17.818181818181817, - "grad_norm": 2.3232202529907227, - "learning_rate": 6.440000000000001e-06, - "loss": 0.0793, - "step": 980 - }, - { - "epoch": 17.836363636363636, - "grad_norm": 0.7211915850639343, - "learning_rate": 6.436363636363637e-06, - "loss": 0.0599, - "step": 981 - }, - { - "epoch": 17.854545454545455, - "grad_norm": 2.742971658706665, - "learning_rate": 6.432727272727273e-06, - "loss": 0.0561, - "step": 982 - }, - { - "epoch": 17.87272727272727, - "grad_norm": 3.969449520111084, - "learning_rate": 6.4290909090909094e-06, - "loss": 0.0687, - "step": 983 - }, - { - "epoch": 17.89090909090909, - "grad_norm": 3.6285219192504883, - "learning_rate": 6.425454545454546e-06, - "loss": 0.0622, - "step": 984 - }, - { - "epoch": 17.90909090909091, - "grad_norm": 2.3804738521575928, - "learning_rate": 6.421818181818182e-06, - "loss": 0.0614, - "step": 985 - }, - { - "epoch": 17.927272727272726, - "grad_norm": 2.352264165878296, - "learning_rate": 6.418181818181819e-06, - "loss": 0.0601, - "step": 986 - }, - { - "epoch": 17.945454545454545, - "grad_norm": 4.285254955291748, - "learning_rate": 6.4145454545454554e-06, - "loss": 0.06, - "step": 987 - }, - { - "epoch": 17.963636363636365, - "grad_norm": 4.792948246002197, - "learning_rate": 6.410909090909091e-06, - "loss": 0.0651, - "step": 988 - }, - { - "epoch": 17.98181818181818, - "grad_norm": 1.8418738842010498, - "learning_rate": 6.407272727272727e-06, - "loss": 0.0575, - "step": 989 - }, - { - "epoch": 18.0, - "grad_norm": 0.5135206580162048, - "learning_rate": 6.403636363636364e-06, - "loss": 0.0715, - "step": 990 - }, - { - "epoch": 18.0, - "eval_loss": 0.06388681381940842, - "eval_runtime": 8.4113, - "eval_samples_per_second": 647.342, - "eval_steps_per_second": 80.962, - "step": 990 - }, - { - "epoch": 18.01818181818182, - "grad_norm": 5.280696868896484, - "learning_rate": 6.4000000000000006e-06, - "loss": 0.0625, - "step": 991 - }, - { - "epoch": 18.036363636363635, - "grad_norm": 4.868581771850586, - "learning_rate": 6.396363636363637e-06, - "loss": 0.0751, - "step": 992 - }, - { - "epoch": 18.054545454545455, - "grad_norm": 3.52675724029541, - "learning_rate": 6.392727272727274e-06, - "loss": 0.0551, - "step": 993 - }, - { - "epoch": 18.072727272727274, - "grad_norm": 0.8013676404953003, - "learning_rate": 6.389090909090909e-06, - "loss": 0.0622, - "step": 994 - }, - { - "epoch": 18.09090909090909, - "grad_norm": 0.8263477683067322, - "learning_rate": 6.385454545454546e-06, - "loss": 0.0691, - "step": 995 - }, - { - "epoch": 18.10909090909091, - "grad_norm": 1.0651466846466064, - "learning_rate": 6.381818181818182e-06, - "loss": 0.0483, - "step": 996 - }, - { - "epoch": 18.12727272727273, - "grad_norm": 1.2623308897018433, - "learning_rate": 6.378181818181819e-06, - "loss": 0.0531, - "step": 997 - }, - { - "epoch": 18.145454545454545, - "grad_norm": 1.4461807012557983, - "learning_rate": 6.374545454545456e-06, - "loss": 0.0549, - "step": 998 - }, - { - "epoch": 18.163636363636364, - "grad_norm": 1.256278395652771, - "learning_rate": 6.370909090909091e-06, - "loss": 0.0595, - "step": 999 - }, - { - "epoch": 18.181818181818183, - "grad_norm": 0.6694340705871582, - "learning_rate": 6.3672727272727275e-06, - "loss": 0.0521, - "step": 1000 - }, - { - "epoch": 18.2, - "grad_norm": 0.4432702958583832, - "learning_rate": 6.363636363636364e-06, - "loss": 0.0572, - "step": 1001 - }, - { - "epoch": 18.21818181818182, - "grad_norm": 2.55157732963562, - "learning_rate": 6.360000000000001e-06, - "loss": 0.0558, - "step": 1002 - }, - { - "epoch": 18.236363636363638, - "grad_norm": 1.210538387298584, - "learning_rate": 6.356363636363637e-06, - "loss": 0.087, - "step": 1003 - }, - { - "epoch": 18.254545454545454, - "grad_norm": 1.076582908630371, - "learning_rate": 6.3527272727272735e-06, - "loss": 0.0432, - "step": 1004 - }, - { - "epoch": 18.272727272727273, - "grad_norm": 1.3988832235336304, - "learning_rate": 6.349090909090909e-06, - "loss": 0.0665, - "step": 1005 - }, - { - "epoch": 18.29090909090909, - "grad_norm": 0.3087432384490967, - "learning_rate": 6.345454545454546e-06, - "loss": 0.0717, - "step": 1006 - }, - { - "epoch": 18.30909090909091, - "grad_norm": 0.5725865960121155, - "learning_rate": 6.341818181818182e-06, - "loss": 0.0556, - "step": 1007 - }, - { - "epoch": 18.327272727272728, - "grad_norm": 0.5383421778678894, - "learning_rate": 6.338181818181819e-06, - "loss": 0.0646, - "step": 1008 - }, - { - "epoch": 18.345454545454544, - "grad_norm": 0.8586176633834839, - "learning_rate": 6.334545454545455e-06, - "loss": 0.0447, - "step": 1009 - }, - { - "epoch": 18.363636363636363, - "grad_norm": 0.43145492672920227, - "learning_rate": 6.330909090909091e-06, - "loss": 0.0584, - "step": 1010 - }, - { - "epoch": 18.381818181818183, - "grad_norm": 2.5964815616607666, - "learning_rate": 6.327272727272727e-06, - "loss": 0.0525, - "step": 1011 - }, - { - "epoch": 18.4, - "grad_norm": 1.5423589944839478, - "learning_rate": 6.323636363636364e-06, - "loss": 0.0766, - "step": 1012 - }, - { - "epoch": 18.418181818181818, - "grad_norm": 0.2282298058271408, - "learning_rate": 6.3200000000000005e-06, - "loss": 0.0602, - "step": 1013 - }, - { - "epoch": 18.436363636363637, - "grad_norm": 1.4999743700027466, - "learning_rate": 6.316363636363637e-06, - "loss": 0.0671, - "step": 1014 - }, - { - "epoch": 18.454545454545453, - "grad_norm": 0.5764297842979431, - "learning_rate": 6.312727272727274e-06, - "loss": 0.0545, - "step": 1015 - }, - { - "epoch": 18.472727272727273, - "grad_norm": 0.22203804552555084, - "learning_rate": 6.309090909090909e-06, - "loss": 0.064, - "step": 1016 - }, - { - "epoch": 18.490909090909092, - "grad_norm": 0.434139221906662, - "learning_rate": 6.305454545454546e-06, - "loss": 0.0711, - "step": 1017 - }, - { - "epoch": 18.509090909090908, - "grad_norm": 0.5400356650352478, - "learning_rate": 6.301818181818182e-06, - "loss": 0.0625, - "step": 1018 - }, - { - "epoch": 18.527272727272727, - "grad_norm": 1.9497101306915283, - "learning_rate": 6.298181818181819e-06, - "loss": 0.0677, - "step": 1019 - }, - { - "epoch": 18.545454545454547, - "grad_norm": 0.20958781242370605, - "learning_rate": 6.294545454545456e-06, - "loss": 0.079, - "step": 1020 - }, - { - "epoch": 18.563636363636363, - "grad_norm": 0.41773664951324463, - "learning_rate": 6.290909090909092e-06, - "loss": 0.0615, - "step": 1021 - }, - { - "epoch": 18.581818181818182, - "grad_norm": 0.3727003037929535, - "learning_rate": 6.2872727272727274e-06, - "loss": 0.0721, - "step": 1022 - }, - { - "epoch": 18.6, - "grad_norm": 0.42247825860977173, - "learning_rate": 6.283636363636364e-06, - "loss": 0.0535, - "step": 1023 - }, - { - "epoch": 18.618181818181817, - "grad_norm": 0.5484874844551086, - "learning_rate": 6.280000000000001e-06, - "loss": 0.0506, - "step": 1024 - }, - { - "epoch": 18.636363636363637, - "grad_norm": 0.5631684064865112, - "learning_rate": 6.276363636363637e-06, - "loss": 0.0628, - "step": 1025 - }, - { - "epoch": 18.654545454545456, - "grad_norm": 1.9881408214569092, - "learning_rate": 6.2727272727272734e-06, - "loss": 0.0588, - "step": 1026 - }, - { - "epoch": 18.672727272727272, - "grad_norm": 1.4796897172927856, - "learning_rate": 6.269090909090909e-06, - "loss": 0.0685, - "step": 1027 - }, - { - "epoch": 18.69090909090909, - "grad_norm": 2.1968111991882324, - "learning_rate": 6.265454545454546e-06, - "loss": 0.0665, - "step": 1028 - }, - { - "epoch": 18.70909090909091, - "grad_norm": 3.7253899574279785, - "learning_rate": 6.261818181818182e-06, - "loss": 0.0614, - "step": 1029 - }, - { - "epoch": 18.727272727272727, - "grad_norm": 4.07662296295166, - "learning_rate": 6.2581818181818186e-06, - "loss": 0.0705, - "step": 1030 - }, - { - "epoch": 18.745454545454546, - "grad_norm": 0.6291860938072205, - "learning_rate": 6.254545454545455e-06, - "loss": 0.0631, - "step": 1031 - }, - { - "epoch": 18.763636363636365, - "grad_norm": 2.946328639984131, - "learning_rate": 6.250909090909092e-06, - "loss": 0.0658, - "step": 1032 - }, - { - "epoch": 18.78181818181818, - "grad_norm": 3.962503433227539, - "learning_rate": 6.247272727272727e-06, - "loss": 0.0615, - "step": 1033 - }, - { - "epoch": 18.8, - "grad_norm": 1.1344726085662842, - "learning_rate": 6.243636363636364e-06, - "loss": 0.0478, - "step": 1034 - }, - { - "epoch": 18.818181818181817, - "grad_norm": 1.5888065099716187, - "learning_rate": 6.24e-06, - "loss": 0.0763, - "step": 1035 - }, - { - "epoch": 18.836363636363636, - "grad_norm": 1.8329042196273804, - "learning_rate": 6.236363636363637e-06, - "loss": 0.0533, - "step": 1036 - }, - { - "epoch": 18.854545454545455, - "grad_norm": 0.7958198189735413, - "learning_rate": 6.232727272727274e-06, - "loss": 0.0585, - "step": 1037 - }, - { - "epoch": 18.87272727272727, - "grad_norm": 1.792304515838623, - "learning_rate": 6.229090909090909e-06, - "loss": 0.0552, - "step": 1038 - }, - { - "epoch": 18.89090909090909, - "grad_norm": 2.5960309505462646, - "learning_rate": 6.2254545454545455e-06, - "loss": 0.0617, - "step": 1039 - }, - { - "epoch": 18.90909090909091, - "grad_norm": 1.1085256338119507, - "learning_rate": 6.221818181818182e-06, - "loss": 0.0659, - "step": 1040 - }, - { - "epoch": 18.927272727272726, - "grad_norm": 1.9193938970565796, - "learning_rate": 6.218181818181819e-06, - "loss": 0.0501, - "step": 1041 - }, - { - "epoch": 18.945454545454545, - "grad_norm": 1.8350393772125244, - "learning_rate": 6.214545454545456e-06, - "loss": 0.0768, - "step": 1042 - }, - { - "epoch": 18.963636363636365, - "grad_norm": 2.0023930072784424, - "learning_rate": 6.2109090909090915e-06, - "loss": 0.0572, - "step": 1043 - }, - { - "epoch": 18.98181818181818, - "grad_norm": 1.6995385885238647, - "learning_rate": 6.207272727272727e-06, - "loss": 0.0633, - "step": 1044 - }, - { - "epoch": 19.0, - "grad_norm": 1.1081515550613403, - "learning_rate": 6.203636363636364e-06, - "loss": 0.0787, - "step": 1045 - }, - { - "epoch": 19.0, - "eval_loss": 0.05790654569864273, - "eval_runtime": 9.9182, - "eval_samples_per_second": 548.991, - "eval_steps_per_second": 68.662, - "step": 1045 - }, - { - "epoch": 19.01818181818182, - "grad_norm": 0.6994706392288208, - "learning_rate": 6.200000000000001e-06, - "loss": 0.0571, - "step": 1046 - }, - { - "epoch": 19.036363636363635, - "grad_norm": 3.161945104598999, - "learning_rate": 6.196363636363637e-06, - "loss": 0.0544, - "step": 1047 - }, - { - "epoch": 19.054545454545455, - "grad_norm": 1.6170823574066162, - "learning_rate": 6.192727272727273e-06, - "loss": 0.0575, - "step": 1048 - }, - { - "epoch": 19.072727272727274, - "grad_norm": 0.3531172275543213, - "learning_rate": 6.18909090909091e-06, - "loss": 0.049, - "step": 1049 - }, - { - "epoch": 19.09090909090909, - "grad_norm": 0.6149300932884216, - "learning_rate": 6.185454545454546e-06, - "loss": 0.0547, - "step": 1050 - }, - { - "epoch": 19.10909090909091, - "grad_norm": 0.43773573637008667, - "learning_rate": 6.181818181818182e-06, - "loss": 0.0552, - "step": 1051 - }, - { - "epoch": 19.12727272727273, - "grad_norm": 0.5945302248001099, - "learning_rate": 6.1781818181818185e-06, - "loss": 0.0627, - "step": 1052 - }, - { - "epoch": 19.145454545454545, - "grad_norm": 2.3801896572113037, - "learning_rate": 6.174545454545455e-06, - "loss": 0.0569, - "step": 1053 - }, - { - "epoch": 19.163636363636364, - "grad_norm": 1.65203058719635, - "learning_rate": 6.170909090909092e-06, - "loss": 0.0555, - "step": 1054 - }, - { - "epoch": 19.181818181818183, - "grad_norm": 0.5856815576553345, - "learning_rate": 6.167272727272727e-06, - "loss": 0.0585, - "step": 1055 - }, - { - "epoch": 19.2, - "grad_norm": 0.19977819919586182, - "learning_rate": 6.163636363636364e-06, - "loss": 0.0552, - "step": 1056 - }, - { - "epoch": 19.21818181818182, - "grad_norm": 2.1077768802642822, - "learning_rate": 6.16e-06, - "loss": 0.0705, - "step": 1057 - }, - { - "epoch": 19.236363636363638, - "grad_norm": 0.6714491248130798, - "learning_rate": 6.156363636363637e-06, - "loss": 0.0428, - "step": 1058 - }, - { - "epoch": 19.254545454545454, - "grad_norm": 1.0074713230133057, - "learning_rate": 6.152727272727274e-06, - "loss": 0.0573, - "step": 1059 - }, - { - "epoch": 19.272727272727273, - "grad_norm": 0.47077107429504395, - "learning_rate": 6.1490909090909104e-06, - "loss": 0.0657, - "step": 1060 - }, - { - "epoch": 19.29090909090909, - "grad_norm": 1.7142242193222046, - "learning_rate": 6.1454545454545454e-06, - "loss": 0.0822, - "step": 1061 - }, - { - "epoch": 19.30909090909091, - "grad_norm": 1.0456510782241821, - "learning_rate": 6.141818181818182e-06, - "loss": 0.0621, - "step": 1062 - }, - { - "epoch": 19.327272727272728, - "grad_norm": 1.4680742025375366, - "learning_rate": 6.138181818181819e-06, - "loss": 0.0557, - "step": 1063 - }, - { - "epoch": 19.345454545454544, - "grad_norm": 3.3751766681671143, - "learning_rate": 6.1345454545454556e-06, - "loss": 0.08, - "step": 1064 - }, - { - "epoch": 19.363636363636363, - "grad_norm": 1.4319839477539062, - "learning_rate": 6.1309090909090914e-06, - "loss": 0.0764, - "step": 1065 - }, - { - "epoch": 19.381818181818183, - "grad_norm": 1.221808910369873, - "learning_rate": 6.127272727272727e-06, - "loss": 0.0572, - "step": 1066 - }, - { - "epoch": 19.4, - "grad_norm": 2.8347368240356445, - "learning_rate": 6.123636363636364e-06, - "loss": 0.0539, - "step": 1067 - }, - { - "epoch": 19.418181818181818, - "grad_norm": 1.5616778135299683, - "learning_rate": 6.120000000000001e-06, - "loss": 0.074, - "step": 1068 - }, - { - "epoch": 19.436363636363637, - "grad_norm": 1.7054544687271118, - "learning_rate": 6.1163636363636366e-06, - "loss": 0.0672, - "step": 1069 - }, - { - "epoch": 19.454545454545453, - "grad_norm": 2.448561668395996, - "learning_rate": 6.112727272727273e-06, - "loss": 0.0475, - "step": 1070 - }, - { - "epoch": 19.472727272727273, - "grad_norm": 1.9597073793411255, - "learning_rate": 6.10909090909091e-06, - "loss": 0.0506, - "step": 1071 - }, - { - "epoch": 19.490909090909092, - "grad_norm": 3.202923536300659, - "learning_rate": 6.105454545454546e-06, - "loss": 0.0465, - "step": 1072 - }, - { - "epoch": 19.509090909090908, - "grad_norm": 2.6730706691741943, - "learning_rate": 6.101818181818182e-06, - "loss": 0.0527, - "step": 1073 - }, - { - "epoch": 19.527272727272727, - "grad_norm": 1.3344438076019287, - "learning_rate": 6.098181818181818e-06, - "loss": 0.0622, - "step": 1074 - }, - { - "epoch": 19.545454545454547, - "grad_norm": 0.7854332327842712, - "learning_rate": 6.094545454545455e-06, - "loss": 0.0645, - "step": 1075 - }, - { - "epoch": 19.563636363636363, - "grad_norm": 1.710201382637024, - "learning_rate": 6.090909090909092e-06, - "loss": 0.0507, - "step": 1076 - }, - { - "epoch": 19.581818181818182, - "grad_norm": 1.796575903892517, - "learning_rate": 6.087272727272727e-06, - "loss": 0.063, - "step": 1077 - }, - { - "epoch": 19.6, - "grad_norm": 0.23633906245231628, - "learning_rate": 6.0836363636363635e-06, - "loss": 0.0571, - "step": 1078 - }, - { - "epoch": 19.618181818181817, - "grad_norm": 2.7051212787628174, - "learning_rate": 6.08e-06, - "loss": 0.061, - "step": 1079 - }, - { - "epoch": 19.636363636363637, - "grad_norm": 4.209739685058594, - "learning_rate": 6.076363636363637e-06, - "loss": 0.0534, - "step": 1080 - }, - { - "epoch": 19.654545454545456, - "grad_norm": 0.5193652510643005, - "learning_rate": 6.072727272727274e-06, - "loss": 0.0672, - "step": 1081 - }, - { - "epoch": 19.672727272727272, - "grad_norm": 0.7180250287055969, - "learning_rate": 6.06909090909091e-06, - "loss": 0.0587, - "step": 1082 - }, - { - "epoch": 19.69090909090909, - "grad_norm": 2.748648166656494, - "learning_rate": 6.065454545454545e-06, - "loss": 0.0637, - "step": 1083 - }, - { - "epoch": 19.70909090909091, - "grad_norm": 3.1058483123779297, - "learning_rate": 6.061818181818182e-06, - "loss": 0.0756, - "step": 1084 - }, - { - "epoch": 19.727272727272727, - "grad_norm": 0.7504304647445679, - "learning_rate": 6.058181818181819e-06, - "loss": 0.0649, - "step": 1085 - }, - { - "epoch": 19.745454545454546, - "grad_norm": 1.066017508506775, - "learning_rate": 6.0545454545454555e-06, - "loss": 0.0599, - "step": 1086 - }, - { - "epoch": 19.763636363636365, - "grad_norm": 3.3258068561553955, - "learning_rate": 6.050909090909091e-06, - "loss": 0.0658, - "step": 1087 - }, - { - "epoch": 19.78181818181818, - "grad_norm": 0.5114735960960388, - "learning_rate": 6.047272727272728e-06, - "loss": 0.0901, - "step": 1088 - }, - { - "epoch": 19.8, - "grad_norm": 0.38989144563674927, - "learning_rate": 6.043636363636364e-06, - "loss": 0.0688, - "step": 1089 - }, - { - "epoch": 19.818181818181817, - "grad_norm": 0.3552303910255432, - "learning_rate": 6.040000000000001e-06, - "loss": 0.0557, - "step": 1090 - }, - { - "epoch": 19.836363636363636, - "grad_norm": 0.6409593224525452, - "learning_rate": 6.0363636363636365e-06, - "loss": 0.0585, - "step": 1091 - }, - { - "epoch": 19.854545454545455, - "grad_norm": 2.4680185317993164, - "learning_rate": 6.032727272727273e-06, - "loss": 0.0706, - "step": 1092 - }, - { - "epoch": 19.87272727272727, - "grad_norm": 1.8790242671966553, - "learning_rate": 6.02909090909091e-06, - "loss": 0.0504, - "step": 1093 - }, - { - "epoch": 19.89090909090909, - "grad_norm": 0.7606863975524902, - "learning_rate": 6.025454545454546e-06, - "loss": 0.0642, - "step": 1094 - }, - { - "epoch": 19.90909090909091, - "grad_norm": 2.1545188426971436, - "learning_rate": 6.021818181818182e-06, - "loss": 0.0554, - "step": 1095 - }, - { - "epoch": 19.927272727272726, - "grad_norm": 0.5745757222175598, - "learning_rate": 6.018181818181818e-06, - "loss": 0.0676, - "step": 1096 - }, - { - "epoch": 19.945454545454545, - "grad_norm": 1.1261132955551147, - "learning_rate": 6.014545454545455e-06, - "loss": 0.0607, - "step": 1097 - }, - { - "epoch": 19.963636363636365, - "grad_norm": 1.2129350900650024, - "learning_rate": 6.010909090909092e-06, - "loss": 0.0601, - "step": 1098 - }, - { - "epoch": 19.98181818181818, - "grad_norm": 0.41379401087760925, - "learning_rate": 6.0072727272727284e-06, - "loss": 0.0684, - "step": 1099 - }, - { - "epoch": 20.0, - "grad_norm": 0.5524367094039917, - "learning_rate": 6.0036363636363634e-06, - "loss": 0.0841, - "step": 1100 - }, - { - "epoch": 20.0, - "eval_loss": 0.05941665545105934, - "eval_runtime": 8.6029, - "eval_samples_per_second": 632.929, - "eval_steps_per_second": 79.16, - "step": 1100 - }, - { - "epoch": 20.01818181818182, - "grad_norm": 0.8829838633537292, - "learning_rate": 6e-06, - "loss": 0.0701, - "step": 1101 - }, - { - "epoch": 20.036363636363635, - "grad_norm": 1.0574616193771362, - "learning_rate": 5.996363636363637e-06, - "loss": 0.0672, - "step": 1102 - }, - { - "epoch": 20.054545454545455, - "grad_norm": 2.457736015319824, - "learning_rate": 5.9927272727272736e-06, - "loss": 0.0389, - "step": 1103 - }, - { - "epoch": 20.072727272727274, - "grad_norm": 2.225860834121704, - "learning_rate": 5.98909090909091e-06, - "loss": 0.0649, - "step": 1104 - }, - { - "epoch": 20.09090909090909, - "grad_norm": 1.5756968259811401, - "learning_rate": 5.985454545454545e-06, - "loss": 0.0696, - "step": 1105 - }, - { - "epoch": 20.10909090909091, - "grad_norm": 3.0596911907196045, - "learning_rate": 5.981818181818182e-06, - "loss": 0.0705, - "step": 1106 - }, - { - "epoch": 20.12727272727273, - "grad_norm": 1.1081348657608032, - "learning_rate": 5.978181818181819e-06, - "loss": 0.0619, - "step": 1107 - }, - { - "epoch": 20.145454545454545, - "grad_norm": 1.4318904876708984, - "learning_rate": 5.974545454545455e-06, - "loss": 0.0535, - "step": 1108 - }, - { - "epoch": 20.163636363636364, - "grad_norm": 1.5164518356323242, - "learning_rate": 5.970909090909091e-06, - "loss": 0.0726, - "step": 1109 - }, - { - "epoch": 20.181818181818183, - "grad_norm": 0.40812382102012634, - "learning_rate": 5.967272727272728e-06, - "loss": 0.0565, - "step": 1110 - }, - { - "epoch": 20.2, - "grad_norm": 0.4611378014087677, - "learning_rate": 5.963636363636364e-06, - "loss": 0.052, - "step": 1111 - }, - { - "epoch": 20.21818181818182, - "grad_norm": 2.695362091064453, - "learning_rate": 5.9600000000000005e-06, - "loss": 0.0515, - "step": 1112 - }, - { - "epoch": 20.236363636363638, - "grad_norm": 2.004549980163574, - "learning_rate": 5.956363636363636e-06, - "loss": 0.0584, - "step": 1113 - }, - { - "epoch": 20.254545454545454, - "grad_norm": 1.5494731664657593, - "learning_rate": 5.952727272727273e-06, - "loss": 0.0693, - "step": 1114 - }, - { - "epoch": 20.272727272727273, - "grad_norm": 3.3420751094818115, - "learning_rate": 5.94909090909091e-06, - "loss": 0.0466, - "step": 1115 - }, - { - "epoch": 20.29090909090909, - "grad_norm": 4.754481792449951, - "learning_rate": 5.9454545454545465e-06, - "loss": 0.0715, - "step": 1116 - }, - { - "epoch": 20.30909090909091, - "grad_norm": 6.009609699249268, - "learning_rate": 5.9418181818181815e-06, - "loss": 0.0835, - "step": 1117 - }, - { - "epoch": 20.327272727272728, - "grad_norm": 2.7596967220306396, - "learning_rate": 5.938181818181818e-06, - "loss": 0.0562, - "step": 1118 - }, - { - "epoch": 20.345454545454544, - "grad_norm": 0.4283748269081116, - "learning_rate": 5.934545454545455e-06, - "loss": 0.0609, - "step": 1119 - }, - { - "epoch": 20.363636363636363, - "grad_norm": 2.742699384689331, - "learning_rate": 5.930909090909092e-06, - "loss": 0.0671, - "step": 1120 - }, - { - "epoch": 20.381818181818183, - "grad_norm": 2.993436098098755, - "learning_rate": 5.927272727272728e-06, - "loss": 0.0508, - "step": 1121 - }, - { - "epoch": 20.4, - "grad_norm": 3.557907819747925, - "learning_rate": 5.923636363636363e-06, - "loss": 0.0743, - "step": 1122 - }, - { - "epoch": 20.418181818181818, - "grad_norm": 1.483481526374817, - "learning_rate": 5.92e-06, - "loss": 0.0544, - "step": 1123 - }, - { - "epoch": 20.436363636363637, - "grad_norm": 3.3564820289611816, - "learning_rate": 5.916363636363637e-06, - "loss": 0.0865, - "step": 1124 - }, - { - "epoch": 20.454545454545453, - "grad_norm": 6.795258522033691, - "learning_rate": 5.9127272727272735e-06, - "loss": 0.0767, - "step": 1125 - }, - { - "epoch": 20.472727272727273, - "grad_norm": 6.808108329772949, - "learning_rate": 5.90909090909091e-06, - "loss": 0.0867, - "step": 1126 - }, - { - "epoch": 20.490909090909092, - "grad_norm": 5.514426231384277, - "learning_rate": 5.905454545454546e-06, - "loss": 0.0902, - "step": 1127 - }, - { - "epoch": 20.509090909090908, - "grad_norm": 1.263844609260559, - "learning_rate": 5.901818181818182e-06, - "loss": 0.0583, - "step": 1128 - }, - { - "epoch": 20.527272727272727, - "grad_norm": 3.4835245609283447, - "learning_rate": 5.898181818181819e-06, - "loss": 0.0716, - "step": 1129 - }, - { - "epoch": 20.545454545454547, - "grad_norm": 5.74829626083374, - "learning_rate": 5.894545454545455e-06, - "loss": 0.0743, - "step": 1130 - }, - { - "epoch": 20.563636363636363, - "grad_norm": 5.130378723144531, - "learning_rate": 5.890909090909091e-06, - "loss": 0.0871, - "step": 1131 - }, - { - "epoch": 20.581818181818182, - "grad_norm": 4.256449222564697, - "learning_rate": 5.887272727272728e-06, - "loss": 0.0598, - "step": 1132 - }, - { - "epoch": 20.6, - "grad_norm": 0.9225368499755859, - "learning_rate": 5.883636363636364e-06, - "loss": 0.045, - "step": 1133 - }, - { - "epoch": 20.618181818181817, - "grad_norm": 2.8501386642456055, - "learning_rate": 5.8800000000000005e-06, - "loss": 0.0557, - "step": 1134 - }, - { - "epoch": 20.636363636363637, - "grad_norm": 3.854949712753296, - "learning_rate": 5.876363636363636e-06, - "loss": 0.0689, - "step": 1135 - }, - { - "epoch": 20.654545454545456, - "grad_norm": 3.235793113708496, - "learning_rate": 5.872727272727273e-06, - "loss": 0.0678, - "step": 1136 - }, - { - "epoch": 20.672727272727272, - "grad_norm": 3.3686914443969727, - "learning_rate": 5.86909090909091e-06, - "loss": 0.0436, - "step": 1137 - }, - { - "epoch": 20.69090909090909, - "grad_norm": 1.3421062231063843, - "learning_rate": 5.8654545454545464e-06, - "loss": 0.0565, - "step": 1138 - }, - { - "epoch": 20.70909090909091, - "grad_norm": 3.269624710083008, - "learning_rate": 5.8618181818181814e-06, - "loss": 0.0601, - "step": 1139 - }, - { - "epoch": 20.727272727272727, - "grad_norm": 1.3546231985092163, - "learning_rate": 5.858181818181818e-06, - "loss": 0.064, - "step": 1140 - }, - { - "epoch": 20.745454545454546, - "grad_norm": 1.481784701347351, - "learning_rate": 5.854545454545455e-06, - "loss": 0.0718, - "step": 1141 - }, - { - "epoch": 20.763636363636365, - "grad_norm": 3.7343342304229736, - "learning_rate": 5.8509090909090916e-06, - "loss": 0.0567, - "step": 1142 - }, - { - "epoch": 20.78181818181818, - "grad_norm": 5.3766255378723145, - "learning_rate": 5.847272727272728e-06, - "loss": 0.0544, - "step": 1143 - }, - { - "epoch": 20.8, - "grad_norm": 4.813802719116211, - "learning_rate": 5.843636363636365e-06, - "loss": 0.0644, - "step": 1144 - }, - { - "epoch": 20.818181818181817, - "grad_norm": 2.7984681129455566, - "learning_rate": 5.84e-06, - "loss": 0.0539, - "step": 1145 - }, - { - "epoch": 20.836363636363636, - "grad_norm": 0.22357964515686035, - "learning_rate": 5.836363636363637e-06, - "loss": 0.0556, - "step": 1146 - }, - { - "epoch": 20.854545454545455, - "grad_norm": 3.5638039112091064, - "learning_rate": 5.832727272727273e-06, - "loss": 0.0714, - "step": 1147 - }, - { - "epoch": 20.87272727272727, - "grad_norm": 5.489165782928467, - "learning_rate": 5.82909090909091e-06, - "loss": 0.0801, - "step": 1148 - }, - { - "epoch": 20.89090909090909, - "grad_norm": 5.3347697257995605, - "learning_rate": 5.825454545454546e-06, - "loss": 0.081, - "step": 1149 - }, - { - "epoch": 20.90909090909091, - "grad_norm": 2.573688268661499, - "learning_rate": 5.821818181818182e-06, - "loss": 0.0688, - "step": 1150 - }, - { - "epoch": 20.927272727272726, - "grad_norm": 1.142368197441101, - "learning_rate": 5.8181818181818185e-06, - "loss": 0.0645, - "step": 1151 - }, - { - "epoch": 20.945454545454545, - "grad_norm": 3.1243460178375244, - "learning_rate": 5.814545454545455e-06, - "loss": 0.0628, - "step": 1152 - }, - { - "epoch": 20.963636363636365, - "grad_norm": 2.5272185802459717, - "learning_rate": 5.810909090909091e-06, - "loss": 0.0616, - "step": 1153 - }, - { - "epoch": 20.98181818181818, - "grad_norm": 0.8083441257476807, - "learning_rate": 5.807272727272728e-06, - "loss": 0.0508, - "step": 1154 - }, - { - "epoch": 21.0, - "grad_norm": 1.1498531103134155, - "learning_rate": 5.8036363636363645e-06, - "loss": 0.0606, - "step": 1155 - }, - { - "epoch": 21.0, - "eval_loss": 0.06074769049882889, - "eval_runtime": 9.3967, - "eval_samples_per_second": 579.458, - "eval_steps_per_second": 72.472, - "step": 1155 - }, - { - "epoch": 21.01818181818182, - "grad_norm": 2.352652072906494, - "learning_rate": 5.8e-06, - "loss": 0.0507, - "step": 1156 - }, - { - "epoch": 21.036363636363635, - "grad_norm": 1.8719052076339722, - "learning_rate": 5.796363636363636e-06, - "loss": 0.0611, - "step": 1157 - }, - { - "epoch": 21.054545454545455, - "grad_norm": 0.3174591660499573, - "learning_rate": 5.792727272727273e-06, - "loss": 0.0537, - "step": 1158 - }, - { - "epoch": 21.072727272727274, - "grad_norm": 1.2155582904815674, - "learning_rate": 5.78909090909091e-06, - "loss": 0.059, - "step": 1159 - }, - { - "epoch": 21.09090909090909, - "grad_norm": 3.7071805000305176, - "learning_rate": 5.785454545454546e-06, - "loss": 0.0393, - "step": 1160 - }, - { - "epoch": 21.10909090909091, - "grad_norm": 0.6900514364242554, - "learning_rate": 5.781818181818181e-06, - "loss": 0.0527, - "step": 1161 - }, - { - "epoch": 21.12727272727273, - "grad_norm": 0.7898769378662109, - "learning_rate": 5.778181818181818e-06, - "loss": 0.06, - "step": 1162 - }, - { - "epoch": 21.145454545454545, - "grad_norm": 2.392285108566284, - "learning_rate": 5.774545454545455e-06, - "loss": 0.0773, - "step": 1163 - }, - { - "epoch": 21.163636363636364, - "grad_norm": 0.30404406785964966, - "learning_rate": 5.7709090909090915e-06, - "loss": 0.0591, - "step": 1164 - }, - { - "epoch": 21.181818181818183, - "grad_norm": 0.44263848662376404, - "learning_rate": 5.767272727272728e-06, - "loss": 0.0664, - "step": 1165 - }, - { - "epoch": 21.2, - "grad_norm": 1.3811010122299194, - "learning_rate": 5.763636363636365e-06, - "loss": 0.0601, - "step": 1166 - }, - { - "epoch": 21.21818181818182, - "grad_norm": 1.6902885437011719, - "learning_rate": 5.76e-06, - "loss": 0.0419, - "step": 1167 - }, - { - "epoch": 21.236363636363638, - "grad_norm": 1.3157438039779663, - "learning_rate": 5.756363636363637e-06, - "loss": 0.0534, - "step": 1168 - }, - { - "epoch": 21.254545454545454, - "grad_norm": 1.9853918552398682, - "learning_rate": 5.752727272727273e-06, - "loss": 0.0609, - "step": 1169 - }, - { - "epoch": 21.272727272727273, - "grad_norm": 0.5625996589660645, - "learning_rate": 5.74909090909091e-06, - "loss": 0.0435, - "step": 1170 - }, - { - "epoch": 21.29090909090909, - "grad_norm": 2.2935469150543213, - "learning_rate": 5.745454545454546e-06, - "loss": 0.0606, - "step": 1171 - }, - { - "epoch": 21.30909090909091, - "grad_norm": 3.4048075675964355, - "learning_rate": 5.741818181818182e-06, - "loss": 0.0578, - "step": 1172 - }, - { - "epoch": 21.327272727272728, - "grad_norm": 1.5226157903671265, - "learning_rate": 5.7381818181818185e-06, - "loss": 0.0704, - "step": 1173 - }, - { - "epoch": 21.345454545454544, - "grad_norm": 1.4347758293151855, - "learning_rate": 5.734545454545455e-06, - "loss": 0.0646, - "step": 1174 - }, - { - "epoch": 21.363636363636363, - "grad_norm": 2.0696940422058105, - "learning_rate": 5.730909090909091e-06, - "loss": 0.0514, - "step": 1175 - }, - { - "epoch": 21.381818181818183, - "grad_norm": 1.5383201837539673, - "learning_rate": 5.727272727272728e-06, - "loss": 0.0703, - "step": 1176 - }, - { - "epoch": 21.4, - "grad_norm": 0.46336036920547485, - "learning_rate": 5.7236363636363644e-06, - "loss": 0.054, - "step": 1177 - }, - { - "epoch": 21.418181818181818, - "grad_norm": 1.9309418201446533, - "learning_rate": 5.72e-06, - "loss": 0.055, - "step": 1178 - }, - { - "epoch": 21.436363636363637, - "grad_norm": 2.9514291286468506, - "learning_rate": 5.716363636363636e-06, - "loss": 0.0469, - "step": 1179 - }, - { - "epoch": 21.454545454545453, - "grad_norm": 0.5202154517173767, - "learning_rate": 5.712727272727273e-06, - "loss": 0.056, - "step": 1180 - }, - { - "epoch": 21.472727272727273, - "grad_norm": 1.6783214807510376, - "learning_rate": 5.7090909090909096e-06, - "loss": 0.0672, - "step": 1181 - }, - { - "epoch": 21.490909090909092, - "grad_norm": 3.6922507286071777, - "learning_rate": 5.705454545454546e-06, - "loss": 0.0723, - "step": 1182 - }, - { - "epoch": 21.509090909090908, - "grad_norm": 1.4448970556259155, - "learning_rate": 5.701818181818183e-06, - "loss": 0.0593, - "step": 1183 - }, - { - "epoch": 21.527272727272727, - "grad_norm": 0.3615138530731201, - "learning_rate": 5.698181818181818e-06, - "loss": 0.0618, - "step": 1184 - }, - { - "epoch": 21.545454545454547, - "grad_norm": 3.6457300186157227, - "learning_rate": 5.694545454545455e-06, - "loss": 0.0485, - "step": 1185 - }, - { - "epoch": 21.563636363636363, - "grad_norm": 2.8168511390686035, - "learning_rate": 5.690909090909091e-06, - "loss": 0.0787, - "step": 1186 - }, - { - "epoch": 21.581818181818182, - "grad_norm": 3.1251046657562256, - "learning_rate": 5.687272727272728e-06, - "loss": 0.0618, - "step": 1187 - }, - { - "epoch": 21.6, - "grad_norm": 0.5187599062919617, - "learning_rate": 5.683636363636365e-06, - "loss": 0.061, - "step": 1188 - }, - { - "epoch": 21.618181818181817, - "grad_norm": 1.9841852188110352, - "learning_rate": 5.68e-06, - "loss": 0.0786, - "step": 1189 - }, - { - "epoch": 21.636363636363637, - "grad_norm": 4.121726036071777, - "learning_rate": 5.6763636363636365e-06, - "loss": 0.062, - "step": 1190 - }, - { - "epoch": 21.654545454545456, - "grad_norm": 2.8835086822509766, - "learning_rate": 5.672727272727273e-06, - "loss": 0.0786, - "step": 1191 - }, - { - "epoch": 21.672727272727272, - "grad_norm": 1.3999583721160889, - "learning_rate": 5.66909090909091e-06, - "loss": 0.0671, - "step": 1192 - }, - { - "epoch": 21.69090909090909, - "grad_norm": 3.6728017330169678, - "learning_rate": 5.665454545454546e-06, - "loss": 0.0579, - "step": 1193 - }, - { - "epoch": 21.70909090909091, - "grad_norm": 5.138390064239502, - "learning_rate": 5.6618181818181825e-06, - "loss": 0.0868, - "step": 1194 - }, - { - "epoch": 21.727272727272727, - "grad_norm": 4.319202423095703, - "learning_rate": 5.658181818181818e-06, - "loss": 0.0697, - "step": 1195 - }, - { - "epoch": 21.745454545454546, - "grad_norm": 2.650716781616211, - "learning_rate": 5.654545454545455e-06, - "loss": 0.077, - "step": 1196 - }, - { - "epoch": 21.763636363636365, - "grad_norm": 0.3616243302822113, - "learning_rate": 5.650909090909091e-06, - "loss": 0.0658, - "step": 1197 - }, - { - "epoch": 21.78181818181818, - "grad_norm": 1.2222181558609009, - "learning_rate": 5.647272727272728e-06, - "loss": 0.0591, - "step": 1198 - }, - { - "epoch": 21.8, - "grad_norm": 1.0295571088790894, - "learning_rate": 5.643636363636364e-06, - "loss": 0.0503, - "step": 1199 - }, - { - "epoch": 21.818181818181817, - "grad_norm": 1.2890013456344604, - "learning_rate": 5.64e-06, - "loss": 0.0593, - "step": 1200 - }, - { - "epoch": 21.836363636363636, - "grad_norm": 0.6108123064041138, - "learning_rate": 5.636363636363636e-06, - "loss": 0.0576, - "step": 1201 - }, - { - "epoch": 21.854545454545455, - "grad_norm": 3.6384994983673096, - "learning_rate": 5.632727272727273e-06, - "loss": 0.048, - "step": 1202 - }, - { - "epoch": 21.87272727272727, - "grad_norm": 3.7728431224823, - "learning_rate": 5.6290909090909095e-06, - "loss": 0.0694, - "step": 1203 - }, - { - "epoch": 21.89090909090909, - "grad_norm": 1.3883213996887207, - "learning_rate": 5.625454545454546e-06, - "loss": 0.0884, - "step": 1204 - }, - { - "epoch": 21.90909090909091, - "grad_norm": 1.1278208494186401, - "learning_rate": 5.621818181818183e-06, - "loss": 0.0742, - "step": 1205 - }, - { - "epoch": 21.927272727272726, - "grad_norm": 3.5175912380218506, - "learning_rate": 5.618181818181818e-06, - "loss": 0.0636, - "step": 1206 - }, - { - "epoch": 21.945454545454545, - "grad_norm": 3.9901390075683594, - "learning_rate": 5.614545454545455e-06, - "loss": 0.069, - "step": 1207 - }, - { - "epoch": 21.963636363636365, - "grad_norm": 4.796106815338135, - "learning_rate": 5.610909090909091e-06, - "loss": 0.0626, - "step": 1208 - }, - { - "epoch": 21.98181818181818, - "grad_norm": 1.0715789794921875, - "learning_rate": 5.607272727272728e-06, - "loss": 0.0528, - "step": 1209 - }, - { - "epoch": 22.0, - "grad_norm": 1.5520519018173218, - "learning_rate": 5.603636363636365e-06, - "loss": 0.0691, - "step": 1210 - }, - { - "epoch": 22.0, - "eval_loss": 0.06051219254732132, - "eval_runtime": 8.968, - "eval_samples_per_second": 607.161, - "eval_steps_per_second": 75.937, - "step": 1210 - }, - { - "epoch": 22.01818181818182, - "grad_norm": 2.101107120513916, - "learning_rate": 5.600000000000001e-06, - "loss": 0.0647, - "step": 1211 - }, - { - "epoch": 22.036363636363635, - "grad_norm": 3.125864028930664, - "learning_rate": 5.5963636363636365e-06, - "loss": 0.0631, - "step": 1212 - }, - { - "epoch": 22.054545454545455, - "grad_norm": 1.0819345712661743, - "learning_rate": 5.592727272727273e-06, - "loss": 0.0543, - "step": 1213 - }, - { - "epoch": 22.072727272727274, - "grad_norm": 1.488279938697815, - "learning_rate": 5.58909090909091e-06, - "loss": 0.0645, - "step": 1214 - }, - { - "epoch": 22.09090909090909, - "grad_norm": 4.718318462371826, - "learning_rate": 5.585454545454546e-06, - "loss": 0.0799, - "step": 1215 - }, - { - "epoch": 22.10909090909091, - "grad_norm": 2.8014864921569824, - "learning_rate": 5.5818181818181824e-06, - "loss": 0.0586, - "step": 1216 - }, - { - "epoch": 22.12727272727273, - "grad_norm": 0.8074561357498169, - "learning_rate": 5.578181818181818e-06, - "loss": 0.0564, - "step": 1217 - }, - { - "epoch": 22.145454545454545, - "grad_norm": 0.29516083002090454, - "learning_rate": 5.574545454545455e-06, - "loss": 0.0643, - "step": 1218 - }, - { - "epoch": 22.163636363636364, - "grad_norm": 3.3853073120117188, - "learning_rate": 5.570909090909091e-06, - "loss": 0.0767, - "step": 1219 - }, - { - "epoch": 22.181818181818183, - "grad_norm": 3.3835668563842773, - "learning_rate": 5.5672727272727276e-06, - "loss": 0.0665, - "step": 1220 - }, - { - "epoch": 22.2, - "grad_norm": 3.432788372039795, - "learning_rate": 5.563636363636364e-06, - "loss": 0.0428, - "step": 1221 - }, - { - "epoch": 22.21818181818182, - "grad_norm": 0.250907838344574, - "learning_rate": 5.560000000000001e-06, - "loss": 0.0549, - "step": 1222 - }, - { - "epoch": 22.236363636363638, - "grad_norm": 2.936164140701294, - "learning_rate": 5.556363636363636e-06, - "loss": 0.0742, - "step": 1223 - }, - { - "epoch": 22.254545454545454, - "grad_norm": 3.368711471557617, - "learning_rate": 5.552727272727273e-06, - "loss": 0.0565, - "step": 1224 - }, - { - "epoch": 22.272727272727273, - "grad_norm": 2.9334664344787598, - "learning_rate": 5.549090909090909e-06, - "loss": 0.0574, - "step": 1225 - }, - { - "epoch": 22.29090909090909, - "grad_norm": 1.1532365083694458, - "learning_rate": 5.545454545454546e-06, - "loss": 0.0633, - "step": 1226 - }, - { - "epoch": 22.30909090909091, - "grad_norm": 4.067857265472412, - "learning_rate": 5.541818181818183e-06, - "loss": 0.0721, - "step": 1227 - }, - { - "epoch": 22.327272727272728, - "grad_norm": 4.622032642364502, - "learning_rate": 5.538181818181818e-06, - "loss": 0.0751, - "step": 1228 - }, - { - "epoch": 22.345454545454544, - "grad_norm": 4.922406196594238, - "learning_rate": 5.5345454545454545e-06, - "loss": 0.0671, - "step": 1229 - }, - { - "epoch": 22.363636363636363, - "grad_norm": 3.706799030303955, - "learning_rate": 5.530909090909091e-06, - "loss": 0.0775, - "step": 1230 - }, - { - "epoch": 22.381818181818183, - "grad_norm": 0.9020352363586426, - "learning_rate": 5.527272727272728e-06, - "loss": 0.0576, - "step": 1231 - }, - { - "epoch": 22.4, - "grad_norm": 2.585310220718384, - "learning_rate": 5.523636363636365e-06, - "loss": 0.0514, - "step": 1232 - }, - { - "epoch": 22.418181818181818, - "grad_norm": 4.79132080078125, - "learning_rate": 5.5200000000000005e-06, - "loss": 0.0642, - "step": 1233 - }, - { - "epoch": 22.436363636363637, - "grad_norm": 4.77208948135376, - "learning_rate": 5.516363636363636e-06, - "loss": 0.0471, - "step": 1234 - }, - { - "epoch": 22.454545454545453, - "grad_norm": 3.5736308097839355, - "learning_rate": 5.512727272727273e-06, - "loss": 0.0551, - "step": 1235 - }, - { - "epoch": 22.472727272727273, - "grad_norm": 2.11493182182312, - "learning_rate": 5.50909090909091e-06, - "loss": 0.0595, - "step": 1236 - }, - { - "epoch": 22.490909090909092, - "grad_norm": 4.053482532501221, - "learning_rate": 5.505454545454546e-06, - "loss": 0.0555, - "step": 1237 - }, - { - "epoch": 22.509090909090908, - "grad_norm": 4.4718098640441895, - "learning_rate": 5.501818181818182e-06, - "loss": 0.0597, - "step": 1238 - }, - { - "epoch": 22.527272727272727, - "grad_norm": 5.377930641174316, - "learning_rate": 5.498181818181819e-06, - "loss": 0.0725, - "step": 1239 - }, - { - "epoch": 22.545454545454547, - "grad_norm": 3.486990451812744, - "learning_rate": 5.494545454545455e-06, - "loss": 0.0631, - "step": 1240 - }, - { - "epoch": 22.563636363636363, - "grad_norm": 0.948314368724823, - "learning_rate": 5.490909090909091e-06, - "loss": 0.063, - "step": 1241 - }, - { - "epoch": 22.581818181818182, - "grad_norm": 3.5503602027893066, - "learning_rate": 5.4872727272727275e-06, - "loss": 0.0829, - "step": 1242 - }, - { - "epoch": 22.6, - "grad_norm": 5.793842315673828, - "learning_rate": 5.483636363636364e-06, - "loss": 0.0723, - "step": 1243 - }, - { - "epoch": 22.618181818181817, - "grad_norm": 6.176164627075195, - "learning_rate": 5.480000000000001e-06, - "loss": 0.087, - "step": 1244 - }, - { - "epoch": 22.636363636363637, - "grad_norm": 3.8878672122955322, - "learning_rate": 5.476363636363636e-06, - "loss": 0.0796, - "step": 1245 - }, - { - "epoch": 22.654545454545456, - "grad_norm": 0.8583476543426514, - "learning_rate": 5.472727272727273e-06, - "loss": 0.069, - "step": 1246 - }, - { - "epoch": 22.672727272727272, - "grad_norm": 3.9922831058502197, - "learning_rate": 5.469090909090909e-06, - "loss": 0.0637, - "step": 1247 - }, - { - "epoch": 22.69090909090909, - "grad_norm": 4.098411560058594, - "learning_rate": 5.465454545454546e-06, - "loss": 0.0658, - "step": 1248 - }, - { - "epoch": 22.70909090909091, - "grad_norm": 4.442993640899658, - "learning_rate": 5.461818181818183e-06, - "loss": 0.0783, - "step": 1249 - }, - { - "epoch": 22.727272727272727, - "grad_norm": 3.029344320297241, - "learning_rate": 5.4581818181818194e-06, - "loss": 0.063, - "step": 1250 - }, - { - "epoch": 22.745454545454546, - "grad_norm": 0.4310535192489624, - "learning_rate": 5.4545454545454545e-06, - "loss": 0.0671, - "step": 1251 - }, - { - "epoch": 22.763636363636365, - "grad_norm": 2.4344253540039062, - "learning_rate": 5.450909090909091e-06, - "loss": 0.049, - "step": 1252 - }, - { - "epoch": 22.78181818181818, - "grad_norm": 3.6528236865997314, - "learning_rate": 5.447272727272728e-06, - "loss": 0.0627, - "step": 1253 - }, - { - "epoch": 22.8, - "grad_norm": 3.2683517932891846, - "learning_rate": 5.4436363636363646e-06, - "loss": 0.072, - "step": 1254 - }, - { - "epoch": 22.818181818181817, - "grad_norm": 2.2099926471710205, - "learning_rate": 5.4400000000000004e-06, - "loss": 0.0601, - "step": 1255 - }, - { - "epoch": 22.836363636363636, - "grad_norm": 1.9511268138885498, - "learning_rate": 5.436363636363636e-06, - "loss": 0.064, - "step": 1256 - }, - { - "epoch": 22.854545454545455, - "grad_norm": 3.043828010559082, - "learning_rate": 5.432727272727273e-06, - "loss": 0.0432, - "step": 1257 - }, - { - "epoch": 22.87272727272727, - "grad_norm": 2.4150049686431885, - "learning_rate": 5.42909090909091e-06, - "loss": 0.0739, - "step": 1258 - }, - { - "epoch": 22.89090909090909, - "grad_norm": 0.2394784837961197, - "learning_rate": 5.4254545454545456e-06, - "loss": 0.0635, - "step": 1259 - }, - { - "epoch": 22.90909090909091, - "grad_norm": 0.9627916812896729, - "learning_rate": 5.421818181818182e-06, - "loss": 0.0558, - "step": 1260 - }, - { - "epoch": 22.927272727272726, - "grad_norm": 0.6863628625869751, - "learning_rate": 5.418181818181819e-06, - "loss": 0.0468, - "step": 1261 - }, - { - "epoch": 22.945454545454545, - "grad_norm": 2.698134183883667, - "learning_rate": 5.414545454545455e-06, - "loss": 0.0672, - "step": 1262 - }, - { - "epoch": 22.963636363636365, - "grad_norm": 1.8558815717697144, - "learning_rate": 5.410909090909091e-06, - "loss": 0.0501, - "step": 1263 - }, - { - "epoch": 22.98181818181818, - "grad_norm": 2.037086248397827, - "learning_rate": 5.407272727272727e-06, - "loss": 0.05, - "step": 1264 - }, - { - "epoch": 23.0, - "grad_norm": 2.342301607131958, - "learning_rate": 5.403636363636364e-06, - "loss": 0.0623, - "step": 1265 - }, - { - "epoch": 23.0, - "eval_loss": 0.05992429330945015, - "eval_runtime": 8.9082, - "eval_samples_per_second": 611.236, - "eval_steps_per_second": 76.447, - "step": 1265 - }, - { - "epoch": 23.01818181818182, - "grad_norm": 1.6867561340332031, - "learning_rate": 5.400000000000001e-06, - "loss": 0.0626, - "step": 1266 - }, - { - "epoch": 23.036363636363635, - "grad_norm": 2.4248647689819336, - "learning_rate": 5.3963636363636375e-06, - "loss": 0.058, - "step": 1267 - }, - { - "epoch": 23.054545454545455, - "grad_norm": 1.175683617591858, - "learning_rate": 5.3927272727272725e-06, - "loss": 0.0508, - "step": 1268 - }, - { - "epoch": 23.072727272727274, - "grad_norm": 0.8181573152542114, - "learning_rate": 5.389090909090909e-06, - "loss": 0.0612, - "step": 1269 - }, - { - "epoch": 23.09090909090909, - "grad_norm": 2.6875739097595215, - "learning_rate": 5.385454545454546e-06, - "loss": 0.0712, - "step": 1270 - }, - { - "epoch": 23.10909090909091, - "grad_norm": 4.313840389251709, - "learning_rate": 5.381818181818183e-06, - "loss": 0.0583, - "step": 1271 - }, - { - "epoch": 23.12727272727273, - "grad_norm": 5.156050205230713, - "learning_rate": 5.378181818181819e-06, - "loss": 0.079, - "step": 1272 - }, - { - "epoch": 23.145454545454545, - "grad_norm": 2.718174934387207, - "learning_rate": 5.374545454545454e-06, - "loss": 0.0572, - "step": 1273 - }, - { - "epoch": 23.163636363636364, - "grad_norm": 0.6573500633239746, - "learning_rate": 5.370909090909091e-06, - "loss": 0.0549, - "step": 1274 - }, - { - "epoch": 23.181818181818183, - "grad_norm": 1.348832368850708, - "learning_rate": 5.367272727272728e-06, - "loss": 0.0402, - "step": 1275 - }, - { - "epoch": 23.2, - "grad_norm": 1.9660660028457642, - "learning_rate": 5.3636363636363645e-06, - "loss": 0.0684, - "step": 1276 - }, - { - "epoch": 23.21818181818182, - "grad_norm": 1.9743163585662842, - "learning_rate": 5.36e-06, - "loss": 0.0604, - "step": 1277 - }, - { - "epoch": 23.236363636363638, - "grad_norm": 1.0946769714355469, - "learning_rate": 5.356363636363637e-06, - "loss": 0.0544, - "step": 1278 - }, - { - "epoch": 23.254545454545454, - "grad_norm": 2.986731767654419, - "learning_rate": 5.352727272727273e-06, - "loss": 0.0566, - "step": 1279 - }, - { - "epoch": 23.272727272727273, - "grad_norm": 2.5351672172546387, - "learning_rate": 5.34909090909091e-06, - "loss": 0.0567, - "step": 1280 - }, - { - "epoch": 23.29090909090909, - "grad_norm": 1.1009401082992554, - "learning_rate": 5.3454545454545455e-06, - "loss": 0.0663, - "step": 1281 - }, - { - "epoch": 23.30909090909091, - "grad_norm": 0.5251254439353943, - "learning_rate": 5.341818181818182e-06, - "loss": 0.0426, - "step": 1282 - }, - { - "epoch": 23.327272727272728, - "grad_norm": 0.8479986786842346, - "learning_rate": 5.338181818181819e-06, - "loss": 0.0604, - "step": 1283 - }, - { - "epoch": 23.345454545454544, - "grad_norm": 0.7918610572814941, - "learning_rate": 5.334545454545455e-06, - "loss": 0.0627, - "step": 1284 - }, - { - "epoch": 23.363636363636363, - "grad_norm": 0.7284315228462219, - "learning_rate": 5.330909090909091e-06, - "loss": 0.0629, - "step": 1285 - }, - { - "epoch": 23.381818181818183, - "grad_norm": 0.8875734210014343, - "learning_rate": 5.327272727272727e-06, - "loss": 0.0694, - "step": 1286 - }, - { - "epoch": 23.4, - "grad_norm": 1.3634393215179443, - "learning_rate": 5.323636363636364e-06, - "loss": 0.0457, - "step": 1287 - }, - { - "epoch": 23.418181818181818, - "grad_norm": 2.2585742473602295, - "learning_rate": 5.320000000000001e-06, - "loss": 0.068, - "step": 1288 - }, - { - "epoch": 23.436363636363637, - "grad_norm": 1.8118019104003906, - "learning_rate": 5.3163636363636374e-06, - "loss": 0.0753, - "step": 1289 - }, - { - "epoch": 23.454545454545453, - "grad_norm": 2.7583868503570557, - "learning_rate": 5.3127272727272725e-06, - "loss": 0.0529, - "step": 1290 - }, - { - "epoch": 23.472727272727273, - "grad_norm": 2.4681882858276367, - "learning_rate": 5.309090909090909e-06, - "loss": 0.0631, - "step": 1291 - }, - { - "epoch": 23.490909090909092, - "grad_norm": 1.6527776718139648, - "learning_rate": 5.305454545454546e-06, - "loss": 0.0695, - "step": 1292 - }, - { - "epoch": 23.509090909090908, - "grad_norm": 0.40958172082901, - "learning_rate": 5.3018181818181826e-06, - "loss": 0.0613, - "step": 1293 - }, - { - "epoch": 23.527272727272727, - "grad_norm": 1.9082196950912476, - "learning_rate": 5.298181818181819e-06, - "loss": 0.0648, - "step": 1294 - }, - { - "epoch": 23.545454545454547, - "grad_norm": 1.323943853378296, - "learning_rate": 5.294545454545454e-06, - "loss": 0.064, - "step": 1295 - }, - { - "epoch": 23.563636363636363, - "grad_norm": 0.7830854058265686, - "learning_rate": 5.290909090909091e-06, - "loss": 0.0553, - "step": 1296 - }, - { - "epoch": 23.581818181818182, - "grad_norm": 0.3010132908821106, - "learning_rate": 5.287272727272728e-06, - "loss": 0.0553, - "step": 1297 - }, - { - "epoch": 23.6, - "grad_norm": 0.8094121813774109, - "learning_rate": 5.283636363636364e-06, - "loss": 0.071, - "step": 1298 - }, - { - "epoch": 23.618181818181817, - "grad_norm": 0.9555163979530334, - "learning_rate": 5.28e-06, - "loss": 0.0636, - "step": 1299 - }, - { - "epoch": 23.636363636363637, - "grad_norm": 0.14719438552856445, - "learning_rate": 5.276363636363637e-06, - "loss": 0.0558, - "step": 1300 - }, - { - "epoch": 23.654545454545456, - "grad_norm": 0.8570860624313354, - "learning_rate": 5.272727272727273e-06, - "loss": 0.0571, - "step": 1301 - }, - { - "epoch": 23.672727272727272, - "grad_norm": 0.6160054206848145, - "learning_rate": 5.2690909090909095e-06, - "loss": 0.0624, - "step": 1302 - }, - { - "epoch": 23.69090909090909, - "grad_norm": 0.6094885468482971, - "learning_rate": 5.265454545454545e-06, - "loss": 0.0572, - "step": 1303 - }, - { - "epoch": 23.70909090909091, - "grad_norm": 0.4078752398490906, - "learning_rate": 5.261818181818182e-06, - "loss": 0.0515, - "step": 1304 - }, - { - "epoch": 23.727272727272727, - "grad_norm": 1.6351726055145264, - "learning_rate": 5.258181818181819e-06, - "loss": 0.0794, - "step": 1305 - }, - { - "epoch": 23.745454545454546, - "grad_norm": 0.6916987895965576, - "learning_rate": 5.2545454545454555e-06, - "loss": 0.0686, - "step": 1306 - }, - { - "epoch": 23.763636363636365, - "grad_norm": 2.38862943649292, - "learning_rate": 5.2509090909090905e-06, - "loss": 0.0659, - "step": 1307 - }, - { - "epoch": 23.78181818181818, - "grad_norm": 2.493891954421997, - "learning_rate": 5.247272727272727e-06, - "loss": 0.0646, - "step": 1308 - }, - { - "epoch": 23.8, - "grad_norm": 1.6728436946868896, - "learning_rate": 5.243636363636364e-06, - "loss": 0.0569, - "step": 1309 - }, - { - "epoch": 23.818181818181817, - "grad_norm": 0.9717400670051575, - "learning_rate": 5.240000000000001e-06, - "loss": 0.0699, - "step": 1310 - }, - { - "epoch": 23.836363636363636, - "grad_norm": 0.6820631623268127, - "learning_rate": 5.236363636363637e-06, - "loss": 0.048, - "step": 1311 - }, - { - "epoch": 23.854545454545455, - "grad_norm": 1.757928729057312, - "learning_rate": 5.232727272727272e-06, - "loss": 0.0754, - "step": 1312 - }, - { - "epoch": 23.87272727272727, - "grad_norm": 1.10893714427948, - "learning_rate": 5.229090909090909e-06, - "loss": 0.0644, - "step": 1313 - }, - { - "epoch": 23.89090909090909, - "grad_norm": 1.5126618146896362, - "learning_rate": 5.225454545454546e-06, - "loss": 0.0607, - "step": 1314 - }, - { - "epoch": 23.90909090909091, - "grad_norm": 1.4888091087341309, - "learning_rate": 5.2218181818181825e-06, - "loss": 0.061, - "step": 1315 - }, - { - "epoch": 23.927272727272726, - "grad_norm": 2.6280102729797363, - "learning_rate": 5.218181818181819e-06, - "loss": 0.0784, - "step": 1316 - }, - { - "epoch": 23.945454545454545, - "grad_norm": 1.1761093139648438, - "learning_rate": 5.214545454545455e-06, - "loss": 0.0438, - "step": 1317 - }, - { - "epoch": 23.963636363636365, - "grad_norm": 0.9583834409713745, - "learning_rate": 5.210909090909091e-06, - "loss": 0.0611, - "step": 1318 - }, - { - "epoch": 23.98181818181818, - "grad_norm": 1.125480055809021, - "learning_rate": 5.207272727272728e-06, - "loss": 0.0495, - "step": 1319 - }, - { - "epoch": 24.0, - "grad_norm": 1.0283899307250977, - "learning_rate": 5.203636363636364e-06, - "loss": 0.0461, - "step": 1320 - }, - { - "epoch": 24.0, - "eval_loss": 0.061738040298223495, - "eval_runtime": 9.1332, - "eval_samples_per_second": 596.176, - "eval_steps_per_second": 74.563, - "step": 1320 - }, - { - "epoch": 24.01818181818182, - "grad_norm": 3.4393200874328613, - "learning_rate": 5.2e-06, - "loss": 0.0493, - "step": 1321 - }, - { - "epoch": 24.036363636363635, - "grad_norm": 3.344987392425537, - "learning_rate": 5.196363636363637e-06, - "loss": 0.0757, - "step": 1322 - }, - { - "epoch": 24.054545454545455, - "grad_norm": 0.43721094727516174, - "learning_rate": 5.192727272727273e-06, - "loss": 0.0569, - "step": 1323 - }, - { - "epoch": 24.072727272727274, - "grad_norm": 1.2755212783813477, - "learning_rate": 5.1890909090909095e-06, - "loss": 0.0712, - "step": 1324 - }, - { - "epoch": 24.09090909090909, - "grad_norm": 3.52242112159729, - "learning_rate": 5.185454545454545e-06, - "loss": 0.0546, - "step": 1325 - }, - { - "epoch": 24.10909090909091, - "grad_norm": 2.076991558074951, - "learning_rate": 5.181818181818182e-06, - "loss": 0.0555, - "step": 1326 - }, - { - "epoch": 24.12727272727273, - "grad_norm": 0.5251204967498779, - "learning_rate": 5.178181818181819e-06, - "loss": 0.0562, - "step": 1327 - }, - { - "epoch": 24.145454545454545, - "grad_norm": 3.0064516067504883, - "learning_rate": 5.1745454545454554e-06, - "loss": 0.0653, - "step": 1328 - }, - { - "epoch": 24.163636363636364, - "grad_norm": 1.7887390851974487, - "learning_rate": 5.170909090909091e-06, - "loss": 0.0533, - "step": 1329 - }, - { - "epoch": 24.181818181818183, - "grad_norm": 0.6104230880737305, - "learning_rate": 5.167272727272727e-06, - "loss": 0.0524, - "step": 1330 - }, - { - "epoch": 24.2, - "grad_norm": 0.5626111626625061, - "learning_rate": 5.163636363636364e-06, - "loss": 0.0549, - "step": 1331 - }, - { - "epoch": 24.21818181818182, - "grad_norm": 1.4972851276397705, - "learning_rate": 5.1600000000000006e-06, - "loss": 0.0492, - "step": 1332 - }, - { - "epoch": 24.236363636363638, - "grad_norm": 1.8624346256256104, - "learning_rate": 5.156363636363637e-06, - "loss": 0.0657, - "step": 1333 - }, - { - "epoch": 24.254545454545454, - "grad_norm": 1.134390115737915, - "learning_rate": 5.152727272727274e-06, - "loss": 0.0788, - "step": 1334 - }, - { - "epoch": 24.272727272727273, - "grad_norm": 0.9516750574111938, - "learning_rate": 5.149090909090909e-06, - "loss": 0.0626, - "step": 1335 - }, - { - "epoch": 24.29090909090909, - "grad_norm": 2.5627777576446533, - "learning_rate": 5.145454545454546e-06, - "loss": 0.0593, - "step": 1336 - }, - { - "epoch": 24.30909090909091, - "grad_norm": 1.5003693103790283, - "learning_rate": 5.141818181818182e-06, - "loss": 0.0726, - "step": 1337 - }, - { - "epoch": 24.327272727272728, - "grad_norm": 3.021095037460327, - "learning_rate": 5.138181818181819e-06, - "loss": 0.0497, - "step": 1338 - }, - { - "epoch": 24.345454545454544, - "grad_norm": 0.847863495349884, - "learning_rate": 5.134545454545455e-06, - "loss": 0.0725, - "step": 1339 - }, - { - "epoch": 24.363636363636363, - "grad_norm": 2.1358959674835205, - "learning_rate": 5.130909090909091e-06, - "loss": 0.0609, - "step": 1340 - }, - { - "epoch": 24.381818181818183, - "grad_norm": 2.4435224533081055, - "learning_rate": 5.1272727272727275e-06, - "loss": 0.056, - "step": 1341 - }, - { - "epoch": 24.4, - "grad_norm": 1.462789535522461, - "learning_rate": 5.123636363636364e-06, - "loss": 0.0676, - "step": 1342 - }, - { - "epoch": 24.418181818181818, - "grad_norm": 1.0165674686431885, - "learning_rate": 5.12e-06, - "loss": 0.084, - "step": 1343 - }, - { - "epoch": 24.436363636363637, - "grad_norm": 2.018033742904663, - "learning_rate": 5.116363636363637e-06, - "loss": 0.0663, - "step": 1344 - }, - { - "epoch": 24.454545454545453, - "grad_norm": 3.429598569869995, - "learning_rate": 5.1127272727272735e-06, - "loss": 0.0375, - "step": 1345 - }, - { - "epoch": 24.472727272727273, - "grad_norm": 1.7991408109664917, - "learning_rate": 5.109090909090909e-06, - "loss": 0.0748, - "step": 1346 - }, - { - "epoch": 24.490909090909092, - "grad_norm": 0.4868219792842865, - "learning_rate": 5.105454545454546e-06, - "loss": 0.0631, - "step": 1347 - }, - { - "epoch": 24.509090909090908, - "grad_norm": 2.2529408931732178, - "learning_rate": 5.101818181818182e-06, - "loss": 0.0746, - "step": 1348 - }, - { - "epoch": 24.527272727272727, - "grad_norm": 2.907907247543335, - "learning_rate": 5.098181818181819e-06, - "loss": 0.0987, - "step": 1349 - }, - { - "epoch": 24.545454545454547, - "grad_norm": 0.5665605068206787, - "learning_rate": 5.094545454545455e-06, - "loss": 0.0471, - "step": 1350 - }, - { - "epoch": 24.563636363636363, - "grad_norm": 1.201952576637268, - "learning_rate": 5.090909090909091e-06, - "loss": 0.0452, - "step": 1351 - }, - { - "epoch": 24.581818181818182, - "grad_norm": 0.9155946373939514, - "learning_rate": 5.087272727272727e-06, - "loss": 0.0561, - "step": 1352 - }, - { - "epoch": 24.6, - "grad_norm": 1.591235876083374, - "learning_rate": 5.083636363636364e-06, - "loss": 0.0574, - "step": 1353 - }, - { - "epoch": 24.618181818181817, - "grad_norm": 0.33349260687828064, - "learning_rate": 5.0800000000000005e-06, - "loss": 0.0582, - "step": 1354 - }, - { - "epoch": 24.636363636363637, - "grad_norm": 0.7005792260169983, - "learning_rate": 5.076363636363637e-06, - "loss": 0.0738, - "step": 1355 - }, - { - "epoch": 24.654545454545456, - "grad_norm": 1.4143171310424805, - "learning_rate": 5.072727272727274e-06, - "loss": 0.0511, - "step": 1356 - }, - { - "epoch": 24.672727272727272, - "grad_norm": 3.2458724975585938, - "learning_rate": 5.069090909090909e-06, - "loss": 0.0502, - "step": 1357 - }, - { - "epoch": 24.69090909090909, - "grad_norm": 0.850993275642395, - "learning_rate": 5.065454545454546e-06, - "loss": 0.0614, - "step": 1358 - }, - { - "epoch": 24.70909090909091, - "grad_norm": 3.5982272624969482, - "learning_rate": 5.061818181818182e-06, - "loss": 0.047, - "step": 1359 - }, - { - "epoch": 24.727272727272727, - "grad_norm": 5.0198073387146, - "learning_rate": 5.058181818181819e-06, - "loss": 0.0723, - "step": 1360 - }, - { - "epoch": 24.745454545454546, - "grad_norm": 3.6935200691223145, - "learning_rate": 5.054545454545455e-06, - "loss": 0.0525, - "step": 1361 - }, - { - "epoch": 24.763636363636365, - "grad_norm": 2.1062357425689697, - "learning_rate": 5.050909090909092e-06, - "loss": 0.0647, - "step": 1362 - }, - { - "epoch": 24.78181818181818, - "grad_norm": 1.2077988386154175, - "learning_rate": 5.0472727272727275e-06, - "loss": 0.0527, - "step": 1363 - }, - { - "epoch": 24.8, - "grad_norm": 1.1347136497497559, - "learning_rate": 5.043636363636364e-06, - "loss": 0.0695, - "step": 1364 - }, - { - "epoch": 24.818181818181817, - "grad_norm": 1.4629991054534912, - "learning_rate": 5.04e-06, - "loss": 0.052, - "step": 1365 - }, - { - "epoch": 24.836363636363636, - "grad_norm": 0.8101198077201843, - "learning_rate": 5.036363636363637e-06, - "loss": 0.0634, - "step": 1366 - }, - { - "epoch": 24.854545454545455, - "grad_norm": 2.1875650882720947, - "learning_rate": 5.0327272727272734e-06, - "loss": 0.071, - "step": 1367 - }, - { - "epoch": 24.87272727272727, - "grad_norm": 2.924896478652954, - "learning_rate": 5.029090909090909e-06, - "loss": 0.0509, - "step": 1368 - }, - { - "epoch": 24.89090909090909, - "grad_norm": 2.8567912578582764, - "learning_rate": 5.025454545454546e-06, - "loss": 0.0557, - "step": 1369 - }, - { - "epoch": 24.90909090909091, - "grad_norm": 1.516141653060913, - "learning_rate": 5.021818181818182e-06, - "loss": 0.0663, - "step": 1370 - }, - { - "epoch": 24.927272727272726, - "grad_norm": 2.7637789249420166, - "learning_rate": 5.0181818181818186e-06, - "loss": 0.0642, - "step": 1371 - }, - { - "epoch": 24.945454545454545, - "grad_norm": 3.6185483932495117, - "learning_rate": 5.014545454545455e-06, - "loss": 0.0633, - "step": 1372 - }, - { - "epoch": 24.963636363636365, - "grad_norm": 4.298628807067871, - "learning_rate": 5.010909090909092e-06, - "loss": 0.0743, - "step": 1373 - }, - { - "epoch": 24.98181818181818, - "grad_norm": 2.7667593955993652, - "learning_rate": 5.007272727272727e-06, - "loss": 0.0543, - "step": 1374 - }, - { - "epoch": 25.0, - "grad_norm": 1.2051984071731567, - "learning_rate": 5.003636363636364e-06, - "loss": 0.0376, - "step": 1375 - }, - { - "epoch": 25.0, - "eval_loss": 0.06429881602525711, - "eval_runtime": 8.9571, - "eval_samples_per_second": 607.897, - "eval_steps_per_second": 76.029, - "step": 1375 - }, - { - "epoch": 25.01818181818182, - "grad_norm": 3.810518264770508, - "learning_rate": 5e-06, - "loss": 0.0673, - "step": 1376 - }, - { - "epoch": 25.036363636363635, - "grad_norm": 6.896632671356201, - "learning_rate": 4.996363636363637e-06, - "loss": 0.0798, - "step": 1377 - }, - { - "epoch": 25.054545454545455, - "grad_norm": 5.65014123916626, - "learning_rate": 4.992727272727273e-06, - "loss": 0.0608, - "step": 1378 - }, - { - "epoch": 25.072727272727274, - "grad_norm": 5.80860710144043, - "learning_rate": 4.98909090909091e-06, - "loss": 0.0708, - "step": 1379 - }, - { - "epoch": 25.09090909090909, - "grad_norm": 3.6611571311950684, - "learning_rate": 4.985454545454546e-06, - "loss": 0.0726, - "step": 1380 - }, - { - "epoch": 25.10909090909091, - "grad_norm": 0.6959943771362305, - "learning_rate": 4.981818181818182e-06, - "loss": 0.0515, - "step": 1381 - }, - { - "epoch": 25.12727272727273, - "grad_norm": 1.4009919166564941, - "learning_rate": 4.978181818181819e-06, - "loss": 0.0689, - "step": 1382 - }, - { - "epoch": 25.145454545454545, - "grad_norm": 2.7451059818267822, - "learning_rate": 4.974545454545455e-06, - "loss": 0.0746, - "step": 1383 - }, - { - "epoch": 25.163636363636364, - "grad_norm": 2.274423360824585, - "learning_rate": 4.9709090909090915e-06, - "loss": 0.0476, - "step": 1384 - }, - { - "epoch": 25.181818181818183, - "grad_norm": 0.6457282900810242, - "learning_rate": 4.967272727272727e-06, - "loss": 0.0515, - "step": 1385 - }, - { - "epoch": 25.2, - "grad_norm": 0.21675589680671692, - "learning_rate": 4.963636363636364e-06, - "loss": 0.0636, - "step": 1386 - }, - { - "epoch": 25.21818181818182, - "grad_norm": 3.589359760284424, - "learning_rate": 4.960000000000001e-06, - "loss": 0.0847, - "step": 1387 - }, - { - "epoch": 25.236363636363638, - "grad_norm": 1.5694563388824463, - "learning_rate": 4.956363636363637e-06, - "loss": 0.0622, - "step": 1388 - }, - { - "epoch": 25.254545454545454, - "grad_norm": 1.0725480318069458, - "learning_rate": 4.952727272727273e-06, - "loss": 0.0488, - "step": 1389 - }, - { - "epoch": 25.272727272727273, - "grad_norm": 0.7147088050842285, - "learning_rate": 4.949090909090909e-06, - "loss": 0.0594, - "step": 1390 - }, - { - "epoch": 25.29090909090909, - "grad_norm": 0.7958609461784363, - "learning_rate": 4.945454545454546e-06, - "loss": 0.0621, - "step": 1391 - }, - { - "epoch": 25.30909090909091, - "grad_norm": 1.6372411251068115, - "learning_rate": 4.941818181818182e-06, - "loss": 0.0559, - "step": 1392 - }, - { - "epoch": 25.327272727272728, - "grad_norm": 0.7440387606620789, - "learning_rate": 4.9381818181818185e-06, - "loss": 0.0528, - "step": 1393 - }, - { - "epoch": 25.345454545454544, - "grad_norm": 1.181160807609558, - "learning_rate": 4.934545454545455e-06, - "loss": 0.0671, - "step": 1394 - }, - { - "epoch": 25.363636363636363, - "grad_norm": 0.5839333534240723, - "learning_rate": 4.930909090909091e-06, - "loss": 0.0594, - "step": 1395 - }, - { - "epoch": 25.381818181818183, - "grad_norm": 2.7347826957702637, - "learning_rate": 4.927272727272728e-06, - "loss": 0.0568, - "step": 1396 - }, - { - "epoch": 25.4, - "grad_norm": 1.8288869857788086, - "learning_rate": 4.923636363636364e-06, - "loss": 0.0594, - "step": 1397 - }, - { - "epoch": 25.418181818181818, - "grad_norm": 1.0384562015533447, - "learning_rate": 4.92e-06, - "loss": 0.0918, - "step": 1398 - }, - { - "epoch": 25.436363636363637, - "grad_norm": 0.5704302787780762, - "learning_rate": 4.916363636363637e-06, - "loss": 0.0449, - "step": 1399 - }, - { - "epoch": 25.454545454545453, - "grad_norm": 0.5620180368423462, - "learning_rate": 4.912727272727273e-06, - "loss": 0.04, - "step": 1400 - }, - { - "epoch": 25.472727272727273, - "grad_norm": 0.4249431788921356, - "learning_rate": 4.90909090909091e-06, - "loss": 0.0521, - "step": 1401 - }, - { - "epoch": 25.490909090909092, - "grad_norm": 0.6128376126289368, - "learning_rate": 4.905454545454546e-06, - "loss": 0.0513, - "step": 1402 - }, - { - "epoch": 25.509090909090908, - "grad_norm": 0.27354612946510315, - "learning_rate": 4.901818181818182e-06, - "loss": 0.0503, - "step": 1403 - }, - { - "epoch": 25.527272727272727, - "grad_norm": 0.8266459107398987, - "learning_rate": 4.898181818181819e-06, - "loss": 0.0726, - "step": 1404 - }, - { - "epoch": 25.545454545454547, - "grad_norm": 0.6035958528518677, - "learning_rate": 4.894545454545455e-06, - "loss": 0.0558, - "step": 1405 - }, - { - "epoch": 25.563636363636363, - "grad_norm": 0.9351880550384521, - "learning_rate": 4.8909090909090914e-06, - "loss": 0.071, - "step": 1406 - }, - { - "epoch": 25.581818181818182, - "grad_norm": 1.1953450441360474, - "learning_rate": 4.887272727272728e-06, - "loss": 0.0772, - "step": 1407 - }, - { - "epoch": 25.6, - "grad_norm": 1.6315524578094482, - "learning_rate": 4.883636363636364e-06, - "loss": 0.0734, - "step": 1408 - }, - { - "epoch": 25.618181818181817, - "grad_norm": 0.8422393202781677, - "learning_rate": 4.880000000000001e-06, - "loss": 0.0533, - "step": 1409 - }, - { - "epoch": 25.636363636363637, - "grad_norm": 0.5037419199943542, - "learning_rate": 4.8763636363636366e-06, - "loss": 0.0525, - "step": 1410 - }, - { - "epoch": 25.654545454545456, - "grad_norm": 0.8241593241691589, - "learning_rate": 4.872727272727273e-06, - "loss": 0.0497, - "step": 1411 - }, - { - "epoch": 25.672727272727272, - "grad_norm": 2.22985577583313, - "learning_rate": 4.869090909090909e-06, - "loss": 0.0623, - "step": 1412 - }, - { - "epoch": 25.69090909090909, - "grad_norm": 0.5651208162307739, - "learning_rate": 4.865454545454546e-06, - "loss": 0.0714, - "step": 1413 - }, - { - "epoch": 25.70909090909091, - "grad_norm": 1.9103368520736694, - "learning_rate": 4.861818181818182e-06, - "loss": 0.0449, - "step": 1414 - }, - { - "epoch": 25.727272727272727, - "grad_norm": 1.3955978155136108, - "learning_rate": 4.858181818181818e-06, - "loss": 0.0538, - "step": 1415 - }, - { - "epoch": 25.745454545454546, - "grad_norm": 0.7979406714439392, - "learning_rate": 4.854545454545455e-06, - "loss": 0.0514, - "step": 1416 - }, - { - "epoch": 25.763636363636365, - "grad_norm": 1.1936836242675781, - "learning_rate": 4.850909090909091e-06, - "loss": 0.0611, - "step": 1417 - }, - { - "epoch": 25.78181818181818, - "grad_norm": 1.0550402402877808, - "learning_rate": 4.847272727272728e-06, - "loss": 0.0663, - "step": 1418 - }, - { - "epoch": 25.8, - "grad_norm": 1.7248601913452148, - "learning_rate": 4.843636363636364e-06, - "loss": 0.0678, - "step": 1419 - }, - { - "epoch": 25.818181818181817, - "grad_norm": 2.863394260406494, - "learning_rate": 4.84e-06, - "loss": 0.0411, - "step": 1420 - }, - { - "epoch": 25.836363636363636, - "grad_norm": 1.6211377382278442, - "learning_rate": 4.836363636363637e-06, - "loss": 0.0612, - "step": 1421 - }, - { - "epoch": 25.854545454545455, - "grad_norm": 0.637668788433075, - "learning_rate": 4.832727272727274e-06, - "loss": 0.0593, - "step": 1422 - }, - { - "epoch": 25.87272727272727, - "grad_norm": 0.5427751541137695, - "learning_rate": 4.8290909090909095e-06, - "loss": 0.0587, - "step": 1423 - }, - { - "epoch": 25.89090909090909, - "grad_norm": 0.8369677066802979, - "learning_rate": 4.825454545454546e-06, - "loss": 0.0505, - "step": 1424 - }, - { - "epoch": 25.90909090909091, - "grad_norm": 1.492172360420227, - "learning_rate": 4.821818181818182e-06, - "loss": 0.072, - "step": 1425 - }, - { - "epoch": 25.927272727272726, - "grad_norm": 1.335091471672058, - "learning_rate": 4.818181818181819e-06, - "loss": 0.0617, - "step": 1426 - }, - { - "epoch": 25.945454545454545, - "grad_norm": 1.2640856504440308, - "learning_rate": 4.8145454545454555e-06, - "loss": 0.0457, - "step": 1427 - }, - { - "epoch": 25.963636363636365, - "grad_norm": 2.333787679672241, - "learning_rate": 4.810909090909091e-06, - "loss": 0.0759, - "step": 1428 - }, - { - "epoch": 25.98181818181818, - "grad_norm": 1.1562212705612183, - "learning_rate": 4.807272727272728e-06, - "loss": 0.0555, - "step": 1429 - }, - { - "epoch": 26.0, - "grad_norm": 0.44830435514450073, - "learning_rate": 4.803636363636364e-06, - "loss": 0.0633, - "step": 1430 - }, - { - "epoch": 26.0, - "eval_loss": 0.05759671330451965, - "eval_runtime": 9.4438, - "eval_samples_per_second": 576.568, - "eval_steps_per_second": 72.111, - "step": 1430 - }, - { - "epoch": 26.01818181818182, - "grad_norm": 2.1837453842163086, - "learning_rate": 4.800000000000001e-06, - "loss": 0.0534, - "step": 1431 - }, - { - "epoch": 26.036363636363635, - "grad_norm": 1.4666674137115479, - "learning_rate": 4.7963636363636365e-06, - "loss": 0.0619, - "step": 1432 - }, - { - "epoch": 26.054545454545455, - "grad_norm": 0.26811501383781433, - "learning_rate": 4.792727272727273e-06, - "loss": 0.0517, - "step": 1433 - }, - { - "epoch": 26.072727272727274, - "grad_norm": 1.5204460620880127, - "learning_rate": 4.789090909090909e-06, - "loss": 0.0703, - "step": 1434 - }, - { - "epoch": 26.09090909090909, - "grad_norm": 0.7842665314674377, - "learning_rate": 4.785454545454546e-06, - "loss": 0.0783, - "step": 1435 - }, - { - "epoch": 26.10909090909091, - "grad_norm": 0.6893973350524902, - "learning_rate": 4.7818181818181825e-06, - "loss": 0.0666, - "step": 1436 - }, - { - "epoch": 26.12727272727273, - "grad_norm": 0.7614837884902954, - "learning_rate": 4.778181818181818e-06, - "loss": 0.0623, - "step": 1437 - }, - { - "epoch": 26.145454545454545, - "grad_norm": 2.2757468223571777, - "learning_rate": 4.774545454545455e-06, - "loss": 0.0584, - "step": 1438 - }, - { - "epoch": 26.163636363636364, - "grad_norm": 1.7642618417739868, - "learning_rate": 4.770909090909091e-06, - "loss": 0.062, - "step": 1439 - }, - { - "epoch": 26.181818181818183, - "grad_norm": 0.24083095788955688, - "learning_rate": 4.767272727272728e-06, - "loss": 0.0591, - "step": 1440 - }, - { - "epoch": 26.2, - "grad_norm": 0.7866027355194092, - "learning_rate": 4.763636363636364e-06, - "loss": 0.0687, - "step": 1441 - }, - { - "epoch": 26.21818181818182, - "grad_norm": 1.478537917137146, - "learning_rate": 4.76e-06, - "loss": 0.0508, - "step": 1442 - }, - { - "epoch": 26.236363636363638, - "grad_norm": 1.1844738721847534, - "learning_rate": 4.756363636363637e-06, - "loss": 0.0718, - "step": 1443 - }, - { - "epoch": 26.254545454545454, - "grad_norm": 0.8181471228599548, - "learning_rate": 4.752727272727274e-06, - "loss": 0.0673, - "step": 1444 - }, - { - "epoch": 26.272727272727273, - "grad_norm": 1.5538526773452759, - "learning_rate": 4.7490909090909094e-06, - "loss": 0.0659, - "step": 1445 - }, - { - "epoch": 26.29090909090909, - "grad_norm": 0.41355040669441223, - "learning_rate": 4.745454545454546e-06, - "loss": 0.0709, - "step": 1446 - }, - { - "epoch": 26.30909090909091, - "grad_norm": 1.6226203441619873, - "learning_rate": 4.741818181818183e-06, - "loss": 0.0647, - "step": 1447 - }, - { - "epoch": 26.327272727272728, - "grad_norm": 0.3639097213745117, - "learning_rate": 4.738181818181819e-06, - "loss": 0.0584, - "step": 1448 - }, - { - "epoch": 26.345454545454544, - "grad_norm": 1.9045621156692505, - "learning_rate": 4.734545454545455e-06, - "loss": 0.0584, - "step": 1449 - }, - { - "epoch": 26.363636363636363, - "grad_norm": 1.3145970106124878, - "learning_rate": 4.730909090909091e-06, - "loss": 0.0703, - "step": 1450 - }, - { - "epoch": 26.381818181818183, - "grad_norm": 1.609418511390686, - "learning_rate": 4.727272727272728e-06, - "loss": 0.0652, - "step": 1451 - }, - { - "epoch": 26.4, - "grad_norm": 0.8582628965377808, - "learning_rate": 4.723636363636364e-06, - "loss": 0.0704, - "step": 1452 - }, - { - "epoch": 26.418181818181818, - "grad_norm": 2.5201618671417236, - "learning_rate": 4.7200000000000005e-06, - "loss": 0.0728, - "step": 1453 - }, - { - "epoch": 26.436363636363637, - "grad_norm": 5.782715797424316, - "learning_rate": 4.716363636363636e-06, - "loss": 0.0679, - "step": 1454 - }, - { - "epoch": 26.454545454545453, - "grad_norm": 2.3802108764648438, - "learning_rate": 4.712727272727273e-06, - "loss": 0.0519, - "step": 1455 - }, - { - "epoch": 26.472727272727273, - "grad_norm": 0.7594008445739746, - "learning_rate": 4.709090909090909e-06, - "loss": 0.0653, - "step": 1456 - }, - { - "epoch": 26.490909090909092, - "grad_norm": 1.4692062139511108, - "learning_rate": 4.705454545454546e-06, - "loss": 0.0488, - "step": 1457 - }, - { - "epoch": 26.509090909090908, - "grad_norm": 1.6307895183563232, - "learning_rate": 4.701818181818182e-06, - "loss": 0.0585, - "step": 1458 - }, - { - "epoch": 26.527272727272727, - "grad_norm": 0.6319008469581604, - "learning_rate": 4.698181818181818e-06, - "loss": 0.0628, - "step": 1459 - }, - { - "epoch": 26.545454545454547, - "grad_norm": 0.49932360649108887, - "learning_rate": 4.694545454545455e-06, - "loss": 0.0467, - "step": 1460 - }, - { - "epoch": 26.563636363636363, - "grad_norm": 0.21811555325984955, - "learning_rate": 4.690909090909092e-06, - "loss": 0.0554, - "step": 1461 - }, - { - "epoch": 26.581818181818182, - "grad_norm": 2.3673431873321533, - "learning_rate": 4.6872727272727275e-06, - "loss": 0.0603, - "step": 1462 - }, - { - "epoch": 26.6, - "grad_norm": 0.8804445266723633, - "learning_rate": 4.683636363636364e-06, - "loss": 0.0517, - "step": 1463 - }, - { - "epoch": 26.618181818181817, - "grad_norm": 0.6105267405509949, - "learning_rate": 4.680000000000001e-06, - "loss": 0.0698, - "step": 1464 - }, - { - "epoch": 26.636363636363637, - "grad_norm": 0.6658057570457458, - "learning_rate": 4.676363636363637e-06, - "loss": 0.0645, - "step": 1465 - }, - { - "epoch": 26.654545454545456, - "grad_norm": 0.6907119154930115, - "learning_rate": 4.6727272727272735e-06, - "loss": 0.0464, - "step": 1466 - }, - { - "epoch": 26.672727272727272, - "grad_norm": 2.5398380756378174, - "learning_rate": 4.669090909090909e-06, - "loss": 0.0618, - "step": 1467 - }, - { - "epoch": 26.69090909090909, - "grad_norm": 0.9803459644317627, - "learning_rate": 4.665454545454546e-06, - "loss": 0.0555, - "step": 1468 - }, - { - "epoch": 26.70909090909091, - "grad_norm": 1.0524306297302246, - "learning_rate": 4.661818181818183e-06, - "loss": 0.0645, - "step": 1469 - }, - { - "epoch": 26.727272727272727, - "grad_norm": 1.5428229570388794, - "learning_rate": 4.658181818181819e-06, - "loss": 0.0388, - "step": 1470 - }, - { - "epoch": 26.745454545454546, - "grad_norm": 0.5843687653541565, - "learning_rate": 4.654545454545455e-06, - "loss": 0.0556, - "step": 1471 - }, - { - "epoch": 26.763636363636365, - "grad_norm": 0.9906989336013794, - "learning_rate": 4.650909090909091e-06, - "loss": 0.0434, - "step": 1472 - }, - { - "epoch": 26.78181818181818, - "grad_norm": 2.0034444332122803, - "learning_rate": 4.647272727272728e-06, - "loss": 0.0611, - "step": 1473 - }, - { - "epoch": 26.8, - "grad_norm": 1.117156744003296, - "learning_rate": 4.643636363636364e-06, - "loss": 0.055, - "step": 1474 - }, - { - "epoch": 26.818181818181817, - "grad_norm": 1.2927886247634888, - "learning_rate": 4.6400000000000005e-06, - "loss": 0.0579, - "step": 1475 - }, - { - "epoch": 26.836363636363636, - "grad_norm": 1.3539191484451294, - "learning_rate": 4.636363636363636e-06, - "loss": 0.0683, - "step": 1476 - }, - { - "epoch": 26.854545454545455, - "grad_norm": 2.092249870300293, - "learning_rate": 4.632727272727273e-06, - "loss": 0.0624, - "step": 1477 - }, - { - "epoch": 26.87272727272727, - "grad_norm": 2.4204671382904053, - "learning_rate": 4.62909090909091e-06, - "loss": 0.0661, - "step": 1478 - }, - { - "epoch": 26.89090909090909, - "grad_norm": 3.6876702308654785, - "learning_rate": 4.625454545454546e-06, - "loss": 0.0551, - "step": 1479 - }, - { - "epoch": 26.90909090909091, - "grad_norm": 1.629820466041565, - "learning_rate": 4.621818181818182e-06, - "loss": 0.0543, - "step": 1480 - }, - { - "epoch": 26.927272727272726, - "grad_norm": 2.6789400577545166, - "learning_rate": 4.618181818181818e-06, - "loss": 0.0617, - "step": 1481 - }, - { - "epoch": 26.945454545454545, - "grad_norm": 4.366017818450928, - "learning_rate": 4.614545454545455e-06, - "loss": 0.0715, - "step": 1482 - }, - { - "epoch": 26.963636363636365, - "grad_norm": 2.629032611846924, - "learning_rate": 4.610909090909092e-06, - "loss": 0.0473, - "step": 1483 - }, - { - "epoch": 26.98181818181818, - "grad_norm": 1.6147429943084717, - "learning_rate": 4.6072727272727274e-06, - "loss": 0.046, - "step": 1484 - }, - { - "epoch": 27.0, - "grad_norm": 0.5623118877410889, - "learning_rate": 4.603636363636364e-06, - "loss": 0.0548, - "step": 1485 - }, - { - "epoch": 27.0, - "eval_loss": 0.05810500308871269, - "eval_runtime": 8.9848, - "eval_samples_per_second": 606.022, - "eval_steps_per_second": 75.795, - "step": 1485 - }, - { - "epoch": 27.01818181818182, - "grad_norm": 2.653932809829712, - "learning_rate": 4.600000000000001e-06, - "loss": 0.0418, - "step": 1486 - }, - { - "epoch": 27.036363636363635, - "grad_norm": 1.6641305685043335, - "learning_rate": 4.596363636363637e-06, - "loss": 0.0653, - "step": 1487 - }, - { - "epoch": 27.054545454545455, - "grad_norm": 0.8900250792503357, - "learning_rate": 4.592727272727273e-06, - "loss": 0.0708, - "step": 1488 - }, - { - "epoch": 27.072727272727274, - "grad_norm": 0.6793739795684814, - "learning_rate": 4.58909090909091e-06, - "loss": 0.0481, - "step": 1489 - }, - { - "epoch": 27.09090909090909, - "grad_norm": 2.4787440299987793, - "learning_rate": 4.585454545454546e-06, - "loss": 0.0608, - "step": 1490 - }, - { - "epoch": 27.10909090909091, - "grad_norm": 2.534809112548828, - "learning_rate": 4.581818181818183e-06, - "loss": 0.0654, - "step": 1491 - }, - { - "epoch": 27.12727272727273, - "grad_norm": 0.7251519560813904, - "learning_rate": 4.5781818181818185e-06, - "loss": 0.0583, - "step": 1492 - }, - { - "epoch": 27.145454545454545, - "grad_norm": 0.8400058150291443, - "learning_rate": 4.574545454545455e-06, - "loss": 0.0789, - "step": 1493 - }, - { - "epoch": 27.163636363636364, - "grad_norm": 0.9011039733886719, - "learning_rate": 4.570909090909091e-06, - "loss": 0.0644, - "step": 1494 - }, - { - "epoch": 27.181818181818183, - "grad_norm": 2.1381654739379883, - "learning_rate": 4.567272727272728e-06, - "loss": 0.0526, - "step": 1495 - }, - { - "epoch": 27.2, - "grad_norm": 1.2775230407714844, - "learning_rate": 4.563636363636364e-06, - "loss": 0.0592, - "step": 1496 - }, - { - "epoch": 27.21818181818182, - "grad_norm": 0.9208635091781616, - "learning_rate": 4.56e-06, - "loss": 0.0708, - "step": 1497 - }, - { - "epoch": 27.236363636363638, - "grad_norm": 2.0347137451171875, - "learning_rate": 4.556363636363636e-06, - "loss": 0.055, - "step": 1498 - }, - { - "epoch": 27.254545454545454, - "grad_norm": 0.47041353583335876, - "learning_rate": 4.552727272727273e-06, - "loss": 0.0681, - "step": 1499 - }, - { - "epoch": 27.272727272727273, - "grad_norm": 1.5622327327728271, - "learning_rate": 4.54909090909091e-06, - "loss": 0.0828, - "step": 1500 - }, - { - "epoch": 27.29090909090909, - "grad_norm": 1.2213224172592163, - "learning_rate": 4.5454545454545455e-06, - "loss": 0.0558, - "step": 1501 - }, - { - "epoch": 27.30909090909091, - "grad_norm": 2.430955648422241, - "learning_rate": 4.541818181818182e-06, - "loss": 0.0518, - "step": 1502 - }, - { - "epoch": 27.327272727272728, - "grad_norm": 0.7928164005279541, - "learning_rate": 4.538181818181819e-06, - "loss": 0.0568, - "step": 1503 - }, - { - "epoch": 27.345454545454544, - "grad_norm": 0.3439847230911255, - "learning_rate": 4.534545454545455e-06, - "loss": 0.0565, - "step": 1504 - }, - { - "epoch": 27.363636363636363, - "grad_norm": 2.279726266860962, - "learning_rate": 4.5309090909090915e-06, - "loss": 0.0522, - "step": 1505 - }, - { - "epoch": 27.381818181818183, - "grad_norm": 2.5572798252105713, - "learning_rate": 4.527272727272727e-06, - "loss": 0.0636, - "step": 1506 - }, - { - "epoch": 27.4, - "grad_norm": 0.799588143825531, - "learning_rate": 4.523636363636364e-06, - "loss": 0.058, - "step": 1507 - }, - { - "epoch": 27.418181818181818, - "grad_norm": 1.9768623113632202, - "learning_rate": 4.520000000000001e-06, - "loss": 0.061, - "step": 1508 - }, - { - "epoch": 27.436363636363637, - "grad_norm": 1.6250628232955933, - "learning_rate": 4.516363636363637e-06, - "loss": 0.0615, - "step": 1509 - }, - { - "epoch": 27.454545454545453, - "grad_norm": 1.8978708982467651, - "learning_rate": 4.512727272727273e-06, - "loss": 0.0678, - "step": 1510 - }, - { - "epoch": 27.472727272727273, - "grad_norm": 0.5915879011154175, - "learning_rate": 4.50909090909091e-06, - "loss": 0.0472, - "step": 1511 - }, - { - "epoch": 27.490909090909092, - "grad_norm": 1.525000810623169, - "learning_rate": 4.505454545454546e-06, - "loss": 0.0614, - "step": 1512 - }, - { - "epoch": 27.509090909090908, - "grad_norm": 2.7891650199890137, - "learning_rate": 4.501818181818183e-06, - "loss": 0.0617, - "step": 1513 - }, - { - "epoch": 27.527272727272727, - "grad_norm": 1.2578272819519043, - "learning_rate": 4.4981818181818185e-06, - "loss": 0.061, - "step": 1514 - }, - { - "epoch": 27.545454545454547, - "grad_norm": 1.2315397262573242, - "learning_rate": 4.494545454545455e-06, - "loss": 0.0565, - "step": 1515 - }, - { - "epoch": 27.563636363636363, - "grad_norm": 1.0684926509857178, - "learning_rate": 4.490909090909091e-06, - "loss": 0.072, - "step": 1516 - }, - { - "epoch": 27.581818181818182, - "grad_norm": 0.7852941155433655, - "learning_rate": 4.487272727272728e-06, - "loss": 0.0597, - "step": 1517 - }, - { - "epoch": 27.6, - "grad_norm": 1.2785192728042603, - "learning_rate": 4.483636363636364e-06, - "loss": 0.054, - "step": 1518 - }, - { - "epoch": 27.618181818181817, - "grad_norm": 0.6447552442550659, - "learning_rate": 4.48e-06, - "loss": 0.0507, - "step": 1519 - }, - { - "epoch": 27.636363636363637, - "grad_norm": 0.8637734055519104, - "learning_rate": 4.476363636363636e-06, - "loss": 0.0707, - "step": 1520 - }, - { - "epoch": 27.654545454545456, - "grad_norm": 1.197811484336853, - "learning_rate": 4.472727272727273e-06, - "loss": 0.0579, - "step": 1521 - }, - { - "epoch": 27.672727272727272, - "grad_norm": 0.7972061634063721, - "learning_rate": 4.46909090909091e-06, - "loss": 0.0705, - "step": 1522 - }, - { - "epoch": 27.69090909090909, - "grad_norm": 1.0290143489837646, - "learning_rate": 4.4654545454545454e-06, - "loss": 0.063, - "step": 1523 - }, - { - "epoch": 27.70909090909091, - "grad_norm": 0.21109052002429962, - "learning_rate": 4.461818181818182e-06, - "loss": 0.0547, - "step": 1524 - }, - { - "epoch": 27.727272727272727, - "grad_norm": 0.8191462159156799, - "learning_rate": 4.458181818181819e-06, - "loss": 0.0556, - "step": 1525 - }, - { - "epoch": 27.745454545454546, - "grad_norm": 0.3015569746494293, - "learning_rate": 4.454545454545455e-06, - "loss": 0.0681, - "step": 1526 - }, - { - "epoch": 27.763636363636365, - "grad_norm": 1.1738694906234741, - "learning_rate": 4.450909090909091e-06, - "loss": 0.0718, - "step": 1527 - }, - { - "epoch": 27.78181818181818, - "grad_norm": 1.0933278799057007, - "learning_rate": 4.447272727272728e-06, - "loss": 0.0523, - "step": 1528 - }, - { - "epoch": 27.8, - "grad_norm": 0.8725619316101074, - "learning_rate": 4.443636363636364e-06, - "loss": 0.0658, - "step": 1529 - }, - { - "epoch": 27.818181818181817, - "grad_norm": 0.39844605326652527, - "learning_rate": 4.440000000000001e-06, - "loss": 0.0596, - "step": 1530 - }, - { - "epoch": 27.836363636363636, - "grad_norm": 1.1306300163269043, - "learning_rate": 4.436363636363637e-06, - "loss": 0.0546, - "step": 1531 - }, - { - "epoch": 27.854545454545455, - "grad_norm": 1.9147512912750244, - "learning_rate": 4.432727272727273e-06, - "loss": 0.0801, - "step": 1532 - }, - { - "epoch": 27.87272727272727, - "grad_norm": 0.4513094425201416, - "learning_rate": 4.42909090909091e-06, - "loss": 0.0484, - "step": 1533 - }, - { - "epoch": 27.89090909090909, - "grad_norm": 0.6857220530509949, - "learning_rate": 4.425454545454546e-06, - "loss": 0.0722, - "step": 1534 - }, - { - "epoch": 27.90909090909091, - "grad_norm": 0.5064235925674438, - "learning_rate": 4.4218181818181825e-06, - "loss": 0.0565, - "step": 1535 - }, - { - "epoch": 27.927272727272726, - "grad_norm": 0.3328354060649872, - "learning_rate": 4.418181818181818e-06, - "loss": 0.0314, - "step": 1536 - }, - { - "epoch": 27.945454545454545, - "grad_norm": 0.6346264481544495, - "learning_rate": 4.414545454545455e-06, - "loss": 0.0521, - "step": 1537 - }, - { - "epoch": 27.963636363636365, - "grad_norm": 0.6510246396064758, - "learning_rate": 4.410909090909091e-06, - "loss": 0.0708, - "step": 1538 - }, - { - "epoch": 27.98181818181818, - "grad_norm": 0.6075253486633301, - "learning_rate": 4.407272727272728e-06, - "loss": 0.0454, - "step": 1539 - }, - { - "epoch": 28.0, - "grad_norm": 1.3129624128341675, - "learning_rate": 4.4036363636363635e-06, - "loss": 0.064, - "step": 1540 - }, - { - "epoch": 28.0, - "eval_loss": 0.057568930089473724, - "eval_runtime": 8.4672, - "eval_samples_per_second": 643.066, - "eval_steps_per_second": 80.428, - "step": 1540 - }, - { - "epoch": 28.01818181818182, - "grad_norm": 1.3181462287902832, - "learning_rate": 4.4e-06, - "loss": 0.0488, - "step": 1541 - }, - { - "epoch": 28.036363636363635, - "grad_norm": 1.9218584299087524, - "learning_rate": 4.396363636363637e-06, - "loss": 0.0643, - "step": 1542 - }, - { - "epoch": 28.054545454545455, - "grad_norm": 0.5385881066322327, - "learning_rate": 4.392727272727273e-06, - "loss": 0.0699, - "step": 1543 - }, - { - "epoch": 28.072727272727274, - "grad_norm": 2.366641044616699, - "learning_rate": 4.3890909090909095e-06, - "loss": 0.0435, - "step": 1544 - }, - { - "epoch": 28.09090909090909, - "grad_norm": 1.7335325479507446, - "learning_rate": 4.385454545454546e-06, - "loss": 0.0557, - "step": 1545 - }, - { - "epoch": 28.10909090909091, - "grad_norm": 1.1030776500701904, - "learning_rate": 4.381818181818182e-06, - "loss": 0.0591, - "step": 1546 - }, - { - "epoch": 28.12727272727273, - "grad_norm": 1.2836047410964966, - "learning_rate": 4.378181818181819e-06, - "loss": 0.0666, - "step": 1547 - }, - { - "epoch": 28.145454545454545, - "grad_norm": 1.6754839420318604, - "learning_rate": 4.374545454545455e-06, - "loss": 0.0756, - "step": 1548 - }, - { - "epoch": 28.163636363636364, - "grad_norm": 0.22959715127944946, - "learning_rate": 4.370909090909091e-06, - "loss": 0.0684, - "step": 1549 - }, - { - "epoch": 28.181818181818183, - "grad_norm": 0.17080801725387573, - "learning_rate": 4.367272727272728e-06, - "loss": 0.0679, - "step": 1550 - }, - { - "epoch": 28.2, - "grad_norm": 0.49473705887794495, - "learning_rate": 4.363636363636364e-06, - "loss": 0.0673, - "step": 1551 - }, - { - "epoch": 28.21818181818182, - "grad_norm": 1.106116771697998, - "learning_rate": 4.360000000000001e-06, - "loss": 0.0688, - "step": 1552 - }, - { - "epoch": 28.236363636363638, - "grad_norm": 0.4661824107170105, - "learning_rate": 4.356363636363637e-06, - "loss": 0.0574, - "step": 1553 - }, - { - "epoch": 28.254545454545454, - "grad_norm": 0.40917161107063293, - "learning_rate": 4.352727272727273e-06, - "loss": 0.0583, - "step": 1554 - }, - { - "epoch": 28.272727272727273, - "grad_norm": 0.5590949654579163, - "learning_rate": 4.34909090909091e-06, - "loss": 0.0542, - "step": 1555 - }, - { - "epoch": 28.29090909090909, - "grad_norm": 0.4187358021736145, - "learning_rate": 4.345454545454546e-06, - "loss": 0.0528, - "step": 1556 - }, - { - "epoch": 28.30909090909091, - "grad_norm": 0.5342415571212769, - "learning_rate": 4.3418181818181824e-06, - "loss": 0.0662, - "step": 1557 - }, - { - "epoch": 28.327272727272728, - "grad_norm": 1.4247816801071167, - "learning_rate": 4.338181818181818e-06, - "loss": 0.061, - "step": 1558 - }, - { - "epoch": 28.345454545454544, - "grad_norm": 1.2403727769851685, - "learning_rate": 4.334545454545455e-06, - "loss": 0.0583, - "step": 1559 - }, - { - "epoch": 28.363636363636363, - "grad_norm": 0.700584352016449, - "learning_rate": 4.330909090909091e-06, - "loss": 0.0624, - "step": 1560 - }, - { - "epoch": 28.381818181818183, - "grad_norm": 2.1429965496063232, - "learning_rate": 4.327272727272728e-06, - "loss": 0.0553, - "step": 1561 - }, - { - "epoch": 28.4, - "grad_norm": 1.7677078247070312, - "learning_rate": 4.3236363636363634e-06, - "loss": 0.0797, - "step": 1562 - }, - { - "epoch": 28.418181818181818, - "grad_norm": 1.9213759899139404, - "learning_rate": 4.32e-06, - "loss": 0.0569, - "step": 1563 - }, - { - "epoch": 28.436363636363637, - "grad_norm": 2.7947256565093994, - "learning_rate": 4.316363636363637e-06, - "loss": 0.0453, - "step": 1564 - }, - { - "epoch": 28.454545454545453, - "grad_norm": 3.244438409805298, - "learning_rate": 4.312727272727273e-06, - "loss": 0.0611, - "step": 1565 - }, - { - "epoch": 28.472727272727273, - "grad_norm": 0.6884074807167053, - "learning_rate": 4.309090909090909e-06, - "loss": 0.0532, - "step": 1566 - }, - { - "epoch": 28.490909090909092, - "grad_norm": 0.4944112002849579, - "learning_rate": 4.305454545454546e-06, - "loss": 0.0632, - "step": 1567 - }, - { - "epoch": 28.509090909090908, - "grad_norm": 0.2813641130924225, - "learning_rate": 4.301818181818182e-06, - "loss": 0.053, - "step": 1568 - }, - { - "epoch": 28.527272727272727, - "grad_norm": 2.601086139678955, - "learning_rate": 4.298181818181819e-06, - "loss": 0.0756, - "step": 1569 - }, - { - "epoch": 28.545454545454547, - "grad_norm": 0.6055359840393066, - "learning_rate": 4.294545454545455e-06, - "loss": 0.0688, - "step": 1570 - }, - { - "epoch": 28.563636363636363, - "grad_norm": 0.6076827645301819, - "learning_rate": 4.290909090909091e-06, - "loss": 0.0569, - "step": 1571 - }, - { - "epoch": 28.581818181818182, - "grad_norm": 1.4285460710525513, - "learning_rate": 4.287272727272728e-06, - "loss": 0.0524, - "step": 1572 - }, - { - "epoch": 28.6, - "grad_norm": 1.5827564001083374, - "learning_rate": 4.283636363636365e-06, - "loss": 0.0518, - "step": 1573 - }, - { - "epoch": 28.618181818181817, - "grad_norm": 0.48734042048454285, - "learning_rate": 4.2800000000000005e-06, - "loss": 0.0565, - "step": 1574 - }, - { - "epoch": 28.636363636363637, - "grad_norm": 2.252662181854248, - "learning_rate": 4.276363636363637e-06, - "loss": 0.0631, - "step": 1575 - }, - { - "epoch": 28.654545454545456, - "grad_norm": 3.946492910385132, - "learning_rate": 4.272727272727273e-06, - "loss": 0.0709, - "step": 1576 - }, - { - "epoch": 28.672727272727272, - "grad_norm": 1.9942643642425537, - "learning_rate": 4.26909090909091e-06, - "loss": 0.0627, - "step": 1577 - }, - { - "epoch": 28.69090909090909, - "grad_norm": 1.0574123859405518, - "learning_rate": 4.265454545454546e-06, - "loss": 0.0379, - "step": 1578 - }, - { - "epoch": 28.70909090909091, - "grad_norm": 2.2671732902526855, - "learning_rate": 4.261818181818182e-06, - "loss": 0.0676, - "step": 1579 - }, - { - "epoch": 28.727272727272727, - "grad_norm": 3.606518268585205, - "learning_rate": 4.258181818181818e-06, - "loss": 0.0617, - "step": 1580 - }, - { - "epoch": 28.745454545454546, - "grad_norm": 3.94199538230896, - "learning_rate": 4.254545454545455e-06, - "loss": 0.0548, - "step": 1581 - }, - { - "epoch": 28.763636363636365, - "grad_norm": 2.303316831588745, - "learning_rate": 4.250909090909091e-06, - "loss": 0.0586, - "step": 1582 - }, - { - "epoch": 28.78181818181818, - "grad_norm": 0.2363220453262329, - "learning_rate": 4.2472727272727275e-06, - "loss": 0.0534, - "step": 1583 - }, - { - "epoch": 28.8, - "grad_norm": 2.3605735301971436, - "learning_rate": 4.243636363636364e-06, - "loss": 0.0609, - "step": 1584 - }, - { - "epoch": 28.818181818181817, - "grad_norm": 2.747548818588257, - "learning_rate": 4.24e-06, - "loss": 0.0508, - "step": 1585 - }, - { - "epoch": 28.836363636363636, - "grad_norm": 1.585766315460205, - "learning_rate": 4.236363636363637e-06, - "loss": 0.0633, - "step": 1586 - }, - { - "epoch": 28.854545454545455, - "grad_norm": 0.7514519095420837, - "learning_rate": 4.2327272727272735e-06, - "loss": 0.0477, - "step": 1587 - }, - { - "epoch": 28.87272727272727, - "grad_norm": 1.4636980295181274, - "learning_rate": 4.229090909090909e-06, - "loss": 0.0531, - "step": 1588 - }, - { - "epoch": 28.89090909090909, - "grad_norm": 1.3001970052719116, - "learning_rate": 4.225454545454546e-06, - "loss": 0.0644, - "step": 1589 - }, - { - "epoch": 28.90909090909091, - "grad_norm": 0.5336251258850098, - "learning_rate": 4.221818181818182e-06, - "loss": 0.0562, - "step": 1590 - }, - { - "epoch": 28.927272727272726, - "grad_norm": 0.34912365674972534, - "learning_rate": 4.218181818181819e-06, - "loss": 0.0632, - "step": 1591 - }, - { - "epoch": 28.945454545454545, - "grad_norm": 0.6443033814430237, - "learning_rate": 4.214545454545455e-06, - "loss": 0.0421, - "step": 1592 - }, - { - "epoch": 28.963636363636365, - "grad_norm": 2.681828260421753, - "learning_rate": 4.210909090909091e-06, - "loss": 0.1062, - "step": 1593 - }, - { - "epoch": 28.98181818181818, - "grad_norm": 0.7467508912086487, - "learning_rate": 4.207272727272728e-06, - "loss": 0.0495, - "step": 1594 - }, - { - "epoch": 29.0, - "grad_norm": 0.5810105800628662, - "learning_rate": 4.203636363636365e-06, - "loss": 0.0497, - "step": 1595 - }, - { - "epoch": 29.0, - "eval_loss": 0.0572790801525116, - "eval_runtime": 9.4585, - "eval_samples_per_second": 575.672, - "eval_steps_per_second": 71.999, - "step": 1595 - }, - { - "epoch": 29.01818181818182, - "grad_norm": 0.6809083223342896, - "learning_rate": 4.2000000000000004e-06, - "loss": 0.0574, - "step": 1596 - }, - { - "epoch": 29.036363636363635, - "grad_norm": 2.556205987930298, - "learning_rate": 4.196363636363637e-06, - "loss": 0.0557, - "step": 1597 - }, - { - "epoch": 29.054545454545455, - "grad_norm": 0.7150790691375732, - "learning_rate": 4.192727272727273e-06, - "loss": 0.065, - "step": 1598 - }, - { - "epoch": 29.072727272727274, - "grad_norm": 0.2575477957725525, - "learning_rate": 4.18909090909091e-06, - "loss": 0.0712, - "step": 1599 - }, - { - "epoch": 29.09090909090909, - "grad_norm": 0.7881742715835571, - "learning_rate": 4.185454545454546e-06, - "loss": 0.0427, - "step": 1600 - }, - { - "epoch": 29.10909090909091, - "grad_norm": 1.4235594272613525, - "learning_rate": 4.181818181818182e-06, - "loss": 0.0589, - "step": 1601 - }, - { - "epoch": 29.12727272727273, - "grad_norm": 0.47386008501052856, - "learning_rate": 4.178181818181818e-06, - "loss": 0.0446, - "step": 1602 - }, - { - "epoch": 29.145454545454545, - "grad_norm": 0.5268478393554688, - "learning_rate": 4.174545454545455e-06, - "loss": 0.0533, - "step": 1603 - }, - { - "epoch": 29.163636363636364, - "grad_norm": 1.2295212745666504, - "learning_rate": 4.170909090909091e-06, - "loss": 0.0794, - "step": 1604 - }, - { - "epoch": 29.181818181818183, - "grad_norm": 0.6470080614089966, - "learning_rate": 4.167272727272727e-06, - "loss": 0.0605, - "step": 1605 - }, - { - "epoch": 29.2, - "grad_norm": 0.45043566823005676, - "learning_rate": 4.163636363636364e-06, - "loss": 0.0413, - "step": 1606 - }, - { - "epoch": 29.21818181818182, - "grad_norm": 2.2870190143585205, - "learning_rate": 4.16e-06, - "loss": 0.0481, - "step": 1607 - }, - { - "epoch": 29.236363636363638, - "grad_norm": 0.38093239068984985, - "learning_rate": 4.156363636363637e-06, - "loss": 0.0591, - "step": 1608 - }, - { - "epoch": 29.254545454545454, - "grad_norm": 1.8167623281478882, - "learning_rate": 4.152727272727273e-06, - "loss": 0.0569, - "step": 1609 - }, - { - "epoch": 29.272727272727273, - "grad_norm": 0.35108816623687744, - "learning_rate": 4.149090909090909e-06, - "loss": 0.0619, - "step": 1610 - }, - { - "epoch": 29.29090909090909, - "grad_norm": 1.1312860250473022, - "learning_rate": 4.145454545454546e-06, - "loss": 0.0692, - "step": 1611 - }, - { - "epoch": 29.30909090909091, - "grad_norm": 2.5690908432006836, - "learning_rate": 4.141818181818183e-06, - "loss": 0.0623, - "step": 1612 - }, - { - "epoch": 29.327272727272728, - "grad_norm": 0.9077398777008057, - "learning_rate": 4.1381818181818185e-06, - "loss": 0.0658, - "step": 1613 - }, - { - "epoch": 29.345454545454544, - "grad_norm": 1.9815759658813477, - "learning_rate": 4.134545454545455e-06, - "loss": 0.0442, - "step": 1614 - }, - { - "epoch": 29.363636363636363, - "grad_norm": 3.2399818897247314, - "learning_rate": 4.130909090909091e-06, - "loss": 0.0662, - "step": 1615 - }, - { - "epoch": 29.381818181818183, - "grad_norm": 2.3948585987091064, - "learning_rate": 4.127272727272728e-06, - "loss": 0.0759, - "step": 1616 - }, - { - "epoch": 29.4, - "grad_norm": 1.2121988534927368, - "learning_rate": 4.1236363636363645e-06, - "loss": 0.0706, - "step": 1617 - }, - { - "epoch": 29.418181818181818, - "grad_norm": 1.5026443004608154, - "learning_rate": 4.12e-06, - "loss": 0.0607, - "step": 1618 - }, - { - "epoch": 29.436363636363637, - "grad_norm": 2.7839465141296387, - "learning_rate": 4.116363636363637e-06, - "loss": 0.0441, - "step": 1619 - }, - { - "epoch": 29.454545454545453, - "grad_norm": 2.233424186706543, - "learning_rate": 4.112727272727273e-06, - "loss": 0.0541, - "step": 1620 - }, - { - "epoch": 29.472727272727273, - "grad_norm": 1.5179940462112427, - "learning_rate": 4.10909090909091e-06, - "loss": 0.0497, - "step": 1621 - }, - { - "epoch": 29.490909090909092, - "grad_norm": 0.6349920630455017, - "learning_rate": 4.1054545454545455e-06, - "loss": 0.0745, - "step": 1622 - }, - { - "epoch": 29.509090909090908, - "grad_norm": 1.7921110391616821, - "learning_rate": 4.101818181818182e-06, - "loss": 0.0585, - "step": 1623 - }, - { - "epoch": 29.527272727272727, - "grad_norm": 1.4845821857452393, - "learning_rate": 4.098181818181818e-06, - "loss": 0.0673, - "step": 1624 - }, - { - "epoch": 29.545454545454547, - "grad_norm": 2.5908501148223877, - "learning_rate": 4.094545454545455e-06, - "loss": 0.0546, - "step": 1625 - }, - { - "epoch": 29.563636363636363, - "grad_norm": 1.4697266817092896, - "learning_rate": 4.0909090909090915e-06, - "loss": 0.067, - "step": 1626 - }, - { - "epoch": 29.581818181818182, - "grad_norm": 0.35375556349754333, - "learning_rate": 4.087272727272727e-06, - "loss": 0.0547, - "step": 1627 - }, - { - "epoch": 29.6, - "grad_norm": 1.718665599822998, - "learning_rate": 4.083636363636364e-06, - "loss": 0.046, - "step": 1628 - }, - { - "epoch": 29.618181818181817, - "grad_norm": 2.092567205429077, - "learning_rate": 4.08e-06, - "loss": 0.0567, - "step": 1629 - }, - { - "epoch": 29.636363636363637, - "grad_norm": 0.47607341408729553, - "learning_rate": 4.076363636363637e-06, - "loss": 0.0612, - "step": 1630 - }, - { - "epoch": 29.654545454545456, - "grad_norm": 0.6008872389793396, - "learning_rate": 4.072727272727273e-06, - "loss": 0.0827, - "step": 1631 - }, - { - "epoch": 29.672727272727272, - "grad_norm": 2.8049259185791016, - "learning_rate": 4.069090909090909e-06, - "loss": 0.048, - "step": 1632 - }, - { - "epoch": 29.69090909090909, - "grad_norm": 0.6018053293228149, - "learning_rate": 4.065454545454546e-06, - "loss": 0.0634, - "step": 1633 - }, - { - "epoch": 29.70909090909091, - "grad_norm": 1.256656527519226, - "learning_rate": 4.061818181818183e-06, - "loss": 0.0553, - "step": 1634 - }, - { - "epoch": 29.727272727272727, - "grad_norm": 0.31160464882850647, - "learning_rate": 4.0581818181818184e-06, - "loss": 0.0644, - "step": 1635 - }, - { - "epoch": 29.745454545454546, - "grad_norm": 0.4514331817626953, - "learning_rate": 4.054545454545455e-06, - "loss": 0.0506, - "step": 1636 - }, - { - "epoch": 29.763636363636365, - "grad_norm": 3.2675395011901855, - "learning_rate": 4.050909090909092e-06, - "loss": 0.0753, - "step": 1637 - }, - { - "epoch": 29.78181818181818, - "grad_norm": 3.383963108062744, - "learning_rate": 4.047272727272728e-06, - "loss": 0.0757, - "step": 1638 - }, - { - "epoch": 29.8, - "grad_norm": 1.1676076650619507, - "learning_rate": 4.0436363636363644e-06, - "loss": 0.0605, - "step": 1639 - }, - { - "epoch": 29.818181818181817, - "grad_norm": 1.8204801082611084, - "learning_rate": 4.04e-06, - "loss": 0.0628, - "step": 1640 - }, - { - "epoch": 29.836363636363636, - "grad_norm": 2.217397689819336, - "learning_rate": 4.036363636363637e-06, - "loss": 0.0674, - "step": 1641 - }, - { - "epoch": 29.854545454545455, - "grad_norm": 0.7834932804107666, - "learning_rate": 4.032727272727273e-06, - "loss": 0.0513, - "step": 1642 - }, - { - "epoch": 29.87272727272727, - "grad_norm": 1.1105726957321167, - "learning_rate": 4.0290909090909096e-06, - "loss": 0.0554, - "step": 1643 - }, - { - "epoch": 29.89090909090909, - "grad_norm": 0.6671727895736694, - "learning_rate": 4.025454545454545e-06, - "loss": 0.0609, - "step": 1644 - }, - { - "epoch": 29.90909090909091, - "grad_norm": 1.4964550733566284, - "learning_rate": 4.021818181818182e-06, - "loss": 0.062, - "step": 1645 - }, - { - "epoch": 29.927272727272726, - "grad_norm": 0.7812409996986389, - "learning_rate": 4.018181818181818e-06, - "loss": 0.059, - "step": 1646 - }, - { - "epoch": 29.945454545454545, - "grad_norm": 0.46275824308395386, - "learning_rate": 4.014545454545455e-06, - "loss": 0.0668, - "step": 1647 - }, - { - "epoch": 29.963636363636365, - "grad_norm": 2.7484638690948486, - "learning_rate": 4.010909090909091e-06, - "loss": 0.0669, - "step": 1648 - }, - { - "epoch": 29.98181818181818, - "grad_norm": 1.584599256515503, - "learning_rate": 4.007272727272727e-06, - "loss": 0.0514, - "step": 1649 - }, - { - "epoch": 30.0, - "grad_norm": 1.1669526100158691, - "learning_rate": 4.003636363636364e-06, - "loss": 0.0387, - "step": 1650 - }, - { - "epoch": 30.0, - "eval_loss": 0.0566595233976841, - "eval_runtime": 9.5971, - "eval_samples_per_second": 567.361, - "eval_steps_per_second": 70.959, - "step": 1650 - }, - { - "epoch": 30.01818181818182, - "grad_norm": 0.8033711314201355, - "learning_rate": 4.000000000000001e-06, - "loss": 0.0527, - "step": 1651 - }, - { - "epoch": 30.036363636363635, - "grad_norm": 0.31837260723114014, - "learning_rate": 3.9963636363636365e-06, - "loss": 0.063, - "step": 1652 - }, - { - "epoch": 30.054545454545455, - "grad_norm": 0.8697006106376648, - "learning_rate": 3.992727272727273e-06, - "loss": 0.0482, - "step": 1653 - }, - { - "epoch": 30.072727272727274, - "grad_norm": 0.31759750843048096, - "learning_rate": 3.98909090909091e-06, - "loss": 0.0624, - "step": 1654 - }, - { - "epoch": 30.09090909090909, - "grad_norm": 0.21320608258247375, - "learning_rate": 3.985454545454546e-06, - "loss": 0.0637, - "step": 1655 - }, - { - "epoch": 30.10909090909091, - "grad_norm": 1.5872488021850586, - "learning_rate": 3.9818181818181825e-06, - "loss": 0.0603, - "step": 1656 - }, - { - "epoch": 30.12727272727273, - "grad_norm": 0.25769850611686707, - "learning_rate": 3.978181818181818e-06, - "loss": 0.0619, - "step": 1657 - }, - { - "epoch": 30.145454545454545, - "grad_norm": 1.8010194301605225, - "learning_rate": 3.974545454545455e-06, - "loss": 0.0543, - "step": 1658 - }, - { - "epoch": 30.163636363636364, - "grad_norm": 0.6169752478599548, - "learning_rate": 3.970909090909092e-06, - "loss": 0.0776, - "step": 1659 - }, - { - "epoch": 30.181818181818183, - "grad_norm": 0.5068159699440002, - "learning_rate": 3.967272727272728e-06, - "loss": 0.0421, - "step": 1660 - }, - { - "epoch": 30.2, - "grad_norm": 1.2063308954238892, - "learning_rate": 3.963636363636364e-06, - "loss": 0.0482, - "step": 1661 - }, - { - "epoch": 30.21818181818182, - "grad_norm": 2.360495090484619, - "learning_rate": 3.96e-06, - "loss": 0.0634, - "step": 1662 - }, - { - "epoch": 30.236363636363638, - "grad_norm": 1.6216001510620117, - "learning_rate": 3.956363636363637e-06, - "loss": 0.0503, - "step": 1663 - }, - { - "epoch": 30.254545454545454, - "grad_norm": 1.6206556558609009, - "learning_rate": 3.952727272727273e-06, - "loss": 0.0766, - "step": 1664 - }, - { - "epoch": 30.272727272727273, - "grad_norm": 0.6286831498146057, - "learning_rate": 3.9490909090909095e-06, - "loss": 0.0752, - "step": 1665 - }, - { - "epoch": 30.29090909090909, - "grad_norm": 1.6825350522994995, - "learning_rate": 3.945454545454545e-06, - "loss": 0.0544, - "step": 1666 - }, - { - "epoch": 30.30909090909091, - "grad_norm": 0.3762204647064209, - "learning_rate": 3.941818181818182e-06, - "loss": 0.0649, - "step": 1667 - }, - { - "epoch": 30.327272727272728, - "grad_norm": 2.089960813522339, - "learning_rate": 3.938181818181819e-06, - "loss": 0.0443, - "step": 1668 - }, - { - "epoch": 30.345454545454544, - "grad_norm": 0.31990402936935425, - "learning_rate": 3.934545454545455e-06, - "loss": 0.0603, - "step": 1669 - }, - { - "epoch": 30.363636363636363, - "grad_norm": 0.285463809967041, - "learning_rate": 3.930909090909091e-06, - "loss": 0.0499, - "step": 1670 - }, - { - "epoch": 30.381818181818183, - "grad_norm": 2.100041627883911, - "learning_rate": 3.927272727272727e-06, - "loss": 0.0492, - "step": 1671 - }, - { - "epoch": 30.4, - "grad_norm": 0.24273833632469177, - "learning_rate": 3.923636363636364e-06, - "loss": 0.0642, - "step": 1672 - }, - { - "epoch": 30.418181818181818, - "grad_norm": 1.6747268438339233, - "learning_rate": 3.920000000000001e-06, - "loss": 0.0604, - "step": 1673 - }, - { - "epoch": 30.436363636363637, - "grad_norm": 0.7674861550331116, - "learning_rate": 3.9163636363636364e-06, - "loss": 0.0643, - "step": 1674 - }, - { - "epoch": 30.454545454545453, - "grad_norm": 1.1953601837158203, - "learning_rate": 3.912727272727273e-06, - "loss": 0.0673, - "step": 1675 - }, - { - "epoch": 30.472727272727273, - "grad_norm": 0.9709298610687256, - "learning_rate": 3.90909090909091e-06, - "loss": 0.0592, - "step": 1676 - }, - { - "epoch": 30.490909090909092, - "grad_norm": 1.0155771970748901, - "learning_rate": 3.905454545454546e-06, - "loss": 0.0519, - "step": 1677 - }, - { - "epoch": 30.509090909090908, - "grad_norm": 1.0697407722473145, - "learning_rate": 3.901818181818182e-06, - "loss": 0.0482, - "step": 1678 - }, - { - "epoch": 30.527272727272727, - "grad_norm": 0.2534489035606384, - "learning_rate": 3.898181818181819e-06, - "loss": 0.0432, - "step": 1679 - }, - { - "epoch": 30.545454545454547, - "grad_norm": 0.8741542100906372, - "learning_rate": 3.894545454545455e-06, - "loss": 0.0533, - "step": 1680 - }, - { - "epoch": 30.563636363636363, - "grad_norm": 1.0032343864440918, - "learning_rate": 3.890909090909092e-06, - "loss": 0.071, - "step": 1681 - }, - { - "epoch": 30.581818181818182, - "grad_norm": 0.4763041138648987, - "learning_rate": 3.8872727272727276e-06, - "loss": 0.0557, - "step": 1682 - }, - { - "epoch": 30.6, - "grad_norm": 1.3324546813964844, - "learning_rate": 3.883636363636364e-06, - "loss": 0.0643, - "step": 1683 - }, - { - "epoch": 30.618181818181817, - "grad_norm": 2.160158634185791, - "learning_rate": 3.88e-06, - "loss": 0.0654, - "step": 1684 - }, - { - "epoch": 30.636363636363637, - "grad_norm": 0.4547255039215088, - "learning_rate": 3.876363636363637e-06, - "loss": 0.0669, - "step": 1685 - }, - { - "epoch": 30.654545454545456, - "grad_norm": 0.3728177845478058, - "learning_rate": 3.872727272727273e-06, - "loss": 0.0592, - "step": 1686 - }, - { - "epoch": 30.672727272727272, - "grad_norm": 1.8757072687149048, - "learning_rate": 3.869090909090909e-06, - "loss": 0.0814, - "step": 1687 - }, - { - "epoch": 30.69090909090909, - "grad_norm": 1.2797826528549194, - "learning_rate": 3.865454545454545e-06, - "loss": 0.065, - "step": 1688 - }, - { - "epoch": 30.70909090909091, - "grad_norm": 0.18825581669807434, - "learning_rate": 3.861818181818182e-06, - "loss": 0.0563, - "step": 1689 - }, - { - "epoch": 30.727272727272727, - "grad_norm": 0.664044976234436, - "learning_rate": 3.858181818181819e-06, - "loss": 0.0724, - "step": 1690 - }, - { - "epoch": 30.745454545454546, - "grad_norm": 2.580392360687256, - "learning_rate": 3.8545454545454545e-06, - "loss": 0.0533, - "step": 1691 - }, - { - "epoch": 30.763636363636365, - "grad_norm": 1.9529935121536255, - "learning_rate": 3.850909090909091e-06, - "loss": 0.058, - "step": 1692 - }, - { - "epoch": 30.78181818181818, - "grad_norm": 0.5996452569961548, - "learning_rate": 3.847272727272728e-06, - "loss": 0.056, - "step": 1693 - }, - { - "epoch": 30.8, - "grad_norm": 1.5448031425476074, - "learning_rate": 3.843636363636364e-06, - "loss": 0.0567, - "step": 1694 - }, - { - "epoch": 30.818181818181817, - "grad_norm": 0.42574429512023926, - "learning_rate": 3.8400000000000005e-06, - "loss": 0.0524, - "step": 1695 - }, - { - "epoch": 30.836363636363636, - "grad_norm": 0.5064520835876465, - "learning_rate": 3.836363636363636e-06, - "loss": 0.0607, - "step": 1696 - }, - { - "epoch": 30.854545454545455, - "grad_norm": 0.5658007860183716, - "learning_rate": 3.832727272727273e-06, - "loss": 0.061, - "step": 1697 - }, - { - "epoch": 30.87272727272727, - "grad_norm": 1.9731327295303345, - "learning_rate": 3.82909090909091e-06, - "loss": 0.0539, - "step": 1698 - }, - { - "epoch": 30.89090909090909, - "grad_norm": 1.788385033607483, - "learning_rate": 3.825454545454546e-06, - "loss": 0.0693, - "step": 1699 - }, - { - "epoch": 30.90909090909091, - "grad_norm": 0.8219048380851746, - "learning_rate": 3.821818181818182e-06, - "loss": 0.066, - "step": 1700 - }, - { - "epoch": 30.927272727272726, - "grad_norm": 0.9388870000839233, - "learning_rate": 3.818181818181819e-06, - "loss": 0.0606, - "step": 1701 - }, - { - "epoch": 30.945454545454545, - "grad_norm": 2.8804731369018555, - "learning_rate": 3.8145454545454545e-06, - "loss": 0.0558, - "step": 1702 - }, - { - "epoch": 30.963636363636365, - "grad_norm": 2.756359815597534, - "learning_rate": 3.810909090909091e-06, - "loss": 0.0568, - "step": 1703 - }, - { - "epoch": 30.98181818181818, - "grad_norm": 1.4934144020080566, - "learning_rate": 3.807272727272728e-06, - "loss": 0.0772, - "step": 1704 - }, - { - "epoch": 31.0, - "grad_norm": 2.284065008163452, - "learning_rate": 3.8036363636363638e-06, - "loss": 0.0693, - "step": 1705 - }, - { - "epoch": 31.0, - "eval_loss": 0.06086741387844086, - "eval_runtime": 9.3422, - "eval_samples_per_second": 582.839, - "eval_steps_per_second": 72.895, - "step": 1705 - }, - { - "epoch": 31.01818181818182, - "grad_norm": 2.9924073219299316, - "learning_rate": 3.8000000000000005e-06, - "loss": 0.0555, - "step": 1706 - }, - { - "epoch": 31.036363636363635, - "grad_norm": 2.812579870223999, - "learning_rate": 3.7963636363636367e-06, - "loss": 0.0718, - "step": 1707 - }, - { - "epoch": 31.054545454545455, - "grad_norm": 1.3866796493530273, - "learning_rate": 3.792727272727273e-06, - "loss": 0.0858, - "step": 1708 - }, - { - "epoch": 31.072727272727274, - "grad_norm": 0.2944263815879822, - "learning_rate": 3.7890909090909093e-06, - "loss": 0.0412, - "step": 1709 - }, - { - "epoch": 31.09090909090909, - "grad_norm": 2.1752920150756836, - "learning_rate": 3.7854545454545456e-06, - "loss": 0.0573, - "step": 1710 - }, - { - "epoch": 31.10909090909091, - "grad_norm": 3.250051259994507, - "learning_rate": 3.781818181818182e-06, - "loss": 0.0675, - "step": 1711 - }, - { - "epoch": 31.12727272727273, - "grad_norm": 1.67738676071167, - "learning_rate": 3.7781818181818186e-06, - "loss": 0.059, - "step": 1712 - }, - { - "epoch": 31.145454545454545, - "grad_norm": 0.5501904487609863, - "learning_rate": 3.7745454545454544e-06, - "loss": 0.0478, - "step": 1713 - }, - { - "epoch": 31.163636363636364, - "grad_norm": 1.3530913591384888, - "learning_rate": 3.770909090909091e-06, - "loss": 0.0659, - "step": 1714 - }, - { - "epoch": 31.181818181818183, - "grad_norm": 2.345813035964966, - "learning_rate": 3.767272727272728e-06, - "loss": 0.0656, - "step": 1715 - }, - { - "epoch": 31.2, - "grad_norm": 0.642487645149231, - "learning_rate": 3.7636363636363637e-06, - "loss": 0.0747, - "step": 1716 - }, - { - "epoch": 31.21818181818182, - "grad_norm": 0.7950825095176697, - "learning_rate": 3.7600000000000004e-06, - "loss": 0.0706, - "step": 1717 - }, - { - "epoch": 31.236363636363638, - "grad_norm": 0.6911962628364563, - "learning_rate": 3.7563636363636367e-06, - "loss": 0.0558, - "step": 1718 - }, - { - "epoch": 31.254545454545454, - "grad_norm": 1.0188809633255005, - "learning_rate": 3.752727272727273e-06, - "loss": 0.0542, - "step": 1719 - }, - { - "epoch": 31.272727272727273, - "grad_norm": 3.2567858695983887, - "learning_rate": 3.7490909090909093e-06, - "loss": 0.0662, - "step": 1720 - }, - { - "epoch": 31.29090909090909, - "grad_norm": 1.1037007570266724, - "learning_rate": 3.745454545454546e-06, - "loss": 0.0593, - "step": 1721 - }, - { - "epoch": 31.30909090909091, - "grad_norm": 0.3703712821006775, - "learning_rate": 3.741818181818182e-06, - "loss": 0.07, - "step": 1722 - }, - { - "epoch": 31.327272727272728, - "grad_norm": 2.291598081588745, - "learning_rate": 3.7381818181818185e-06, - "loss": 0.0513, - "step": 1723 - }, - { - "epoch": 31.345454545454544, - "grad_norm": 2.0262324810028076, - "learning_rate": 3.7345454545454544e-06, - "loss": 0.073, - "step": 1724 - }, - { - "epoch": 31.363636363636363, - "grad_norm": 2.335026741027832, - "learning_rate": 3.730909090909091e-06, - "loss": 0.0644, - "step": 1725 - }, - { - "epoch": 31.381818181818183, - "grad_norm": 1.2442067861557007, - "learning_rate": 3.727272727272728e-06, - "loss": 0.0507, - "step": 1726 - }, - { - "epoch": 31.4, - "grad_norm": 1.7962726354599, - "learning_rate": 3.7236363636363637e-06, - "loss": 0.0571, - "step": 1727 - }, - { - "epoch": 31.418181818181818, - "grad_norm": 3.5134005546569824, - "learning_rate": 3.7200000000000004e-06, - "loss": 0.063, - "step": 1728 - }, - { - "epoch": 31.436363636363637, - "grad_norm": 3.3000383377075195, - "learning_rate": 3.7163636363636367e-06, - "loss": 0.0572, - "step": 1729 - }, - { - "epoch": 31.454545454545453, - "grad_norm": 2.170186758041382, - "learning_rate": 3.712727272727273e-06, - "loss": 0.048, - "step": 1730 - }, - { - "epoch": 31.472727272727273, - "grad_norm": 0.4322919249534607, - "learning_rate": 3.7090909090909092e-06, - "loss": 0.0568, - "step": 1731 - }, - { - "epoch": 31.490909090909092, - "grad_norm": 1.1450276374816895, - "learning_rate": 3.705454545454546e-06, - "loss": 0.0676, - "step": 1732 - }, - { - "epoch": 31.509090909090908, - "grad_norm": 0.686449408531189, - "learning_rate": 3.701818181818182e-06, - "loss": 0.052, - "step": 1733 - }, - { - "epoch": 31.527272727272727, - "grad_norm": 0.8507606983184814, - "learning_rate": 3.6981818181818185e-06, - "loss": 0.0538, - "step": 1734 - }, - { - "epoch": 31.545454545454547, - "grad_norm": 0.9597150087356567, - "learning_rate": 3.694545454545455e-06, - "loss": 0.0632, - "step": 1735 - }, - { - "epoch": 31.563636363636363, - "grad_norm": 1.0175511837005615, - "learning_rate": 3.690909090909091e-06, - "loss": 0.05, - "step": 1736 - }, - { - "epoch": 31.581818181818182, - "grad_norm": 0.29139402508735657, - "learning_rate": 3.6872727272727278e-06, - "loss": 0.0541, - "step": 1737 - }, - { - "epoch": 31.6, - "grad_norm": 1.0778769254684448, - "learning_rate": 3.6836363636363636e-06, - "loss": 0.0739, - "step": 1738 - }, - { - "epoch": 31.618181818181817, - "grad_norm": 0.7249436974525452, - "learning_rate": 3.6800000000000003e-06, - "loss": 0.0669, - "step": 1739 - }, - { - "epoch": 31.636363636363637, - "grad_norm": 0.6636848449707031, - "learning_rate": 3.6763636363636366e-06, - "loss": 0.0561, - "step": 1740 - }, - { - "epoch": 31.654545454545456, - "grad_norm": 0.47538477182388306, - "learning_rate": 3.672727272727273e-06, - "loss": 0.0541, - "step": 1741 - }, - { - "epoch": 31.672727272727272, - "grad_norm": 1.086026668548584, - "learning_rate": 3.669090909090909e-06, - "loss": 0.0747, - "step": 1742 - }, - { - "epoch": 31.69090909090909, - "grad_norm": 1.2165541648864746, - "learning_rate": 3.665454545454546e-06, - "loss": 0.0702, - "step": 1743 - }, - { - "epoch": 31.70909090909091, - "grad_norm": 1.0969233512878418, - "learning_rate": 3.6618181818181818e-06, - "loss": 0.064, - "step": 1744 - }, - { - "epoch": 31.727272727272727, - "grad_norm": 0.6842359900474548, - "learning_rate": 3.6581818181818185e-06, - "loss": 0.0467, - "step": 1745 - }, - { - "epoch": 31.745454545454546, - "grad_norm": 0.5409308671951294, - "learning_rate": 3.654545454545455e-06, - "loss": 0.0552, - "step": 1746 - }, - { - "epoch": 31.763636363636365, - "grad_norm": 3.021909713745117, - "learning_rate": 3.650909090909091e-06, - "loss": 0.0524, - "step": 1747 - }, - { - "epoch": 31.78181818181818, - "grad_norm": 2.8254475593566895, - "learning_rate": 3.6472727272727277e-06, - "loss": 0.0768, - "step": 1748 - }, - { - "epoch": 31.8, - "grad_norm": 2.9707794189453125, - "learning_rate": 3.643636363636364e-06, - "loss": 0.0604, - "step": 1749 - }, - { - "epoch": 31.818181818181817, - "grad_norm": 1.328602910041809, - "learning_rate": 3.6400000000000003e-06, - "loss": 0.047, - "step": 1750 - }, - { - "epoch": 31.836363636363636, - "grad_norm": 2.9099009037017822, - "learning_rate": 3.6363636363636366e-06, - "loss": 0.055, - "step": 1751 - }, - { - "epoch": 31.854545454545455, - "grad_norm": 1.204310655593872, - "learning_rate": 3.632727272727273e-06, - "loss": 0.0516, - "step": 1752 - }, - { - "epoch": 31.87272727272727, - "grad_norm": 3.0262045860290527, - "learning_rate": 3.629090909090909e-06, - "loss": 0.053, - "step": 1753 - }, - { - "epoch": 31.89090909090909, - "grad_norm": 1.3845661878585815, - "learning_rate": 3.625454545454546e-06, - "loss": 0.049, - "step": 1754 - }, - { - "epoch": 31.90909090909091, - "grad_norm": 0.8351224660873413, - "learning_rate": 3.6218181818181817e-06, - "loss": 0.0486, - "step": 1755 - }, - { - "epoch": 31.927272727272726, - "grad_norm": 0.9184449911117554, - "learning_rate": 3.6181818181818184e-06, - "loss": 0.0603, - "step": 1756 - }, - { - "epoch": 31.945454545454545, - "grad_norm": 1.6077629327774048, - "learning_rate": 3.614545454545455e-06, - "loss": 0.0567, - "step": 1757 - }, - { - "epoch": 31.963636363636365, - "grad_norm": 0.22140157222747803, - "learning_rate": 3.610909090909091e-06, - "loss": 0.0683, - "step": 1758 - }, - { - "epoch": 31.98181818181818, - "grad_norm": 1.7452433109283447, - "learning_rate": 3.6072727272727277e-06, - "loss": 0.0518, - "step": 1759 - }, - { - "epoch": 32.0, - "grad_norm": 0.41989436745643616, - "learning_rate": 3.603636363636364e-06, - "loss": 0.083, - "step": 1760 - }, - { - "epoch": 32.0, - "eval_loss": 0.056669317185878754, - "eval_runtime": 9.5886, - "eval_samples_per_second": 567.863, - "eval_steps_per_second": 71.022, - "step": 1760 - }, - { - "epoch": 32.018181818181816, - "grad_norm": 0.31750914454460144, - "learning_rate": 3.6000000000000003e-06, - "loss": 0.0673, - "step": 1761 - }, - { - "epoch": 32.03636363636364, - "grad_norm": 1.2970309257507324, - "learning_rate": 3.5963636363636365e-06, - "loss": 0.0554, - "step": 1762 - }, - { - "epoch": 32.054545454545455, - "grad_norm": 0.4423520863056183, - "learning_rate": 3.5927272727272733e-06, - "loss": 0.0509, - "step": 1763 - }, - { - "epoch": 32.07272727272727, - "grad_norm": 0.49937716126441956, - "learning_rate": 3.589090909090909e-06, - "loss": 0.0504, - "step": 1764 - }, - { - "epoch": 32.09090909090909, - "grad_norm": 3.2576940059661865, - "learning_rate": 3.585454545454546e-06, - "loss": 0.0726, - "step": 1765 - }, - { - "epoch": 32.10909090909091, - "grad_norm": 2.061427593231201, - "learning_rate": 3.5818181818181817e-06, - "loss": 0.064, - "step": 1766 - }, - { - "epoch": 32.127272727272725, - "grad_norm": 1.7316759824752808, - "learning_rate": 3.5781818181818184e-06, - "loss": 0.0586, - "step": 1767 - }, - { - "epoch": 32.14545454545455, - "grad_norm": 0.21755695343017578, - "learning_rate": 3.574545454545455e-06, - "loss": 0.0658, - "step": 1768 - }, - { - "epoch": 32.163636363636364, - "grad_norm": 0.8829976320266724, - "learning_rate": 3.570909090909091e-06, - "loss": 0.0392, - "step": 1769 - }, - { - "epoch": 32.18181818181818, - "grad_norm": 0.7784296870231628, - "learning_rate": 3.5672727272727277e-06, - "loss": 0.0617, - "step": 1770 - }, - { - "epoch": 32.2, - "grad_norm": 0.9451490044593811, - "learning_rate": 3.563636363636364e-06, - "loss": 0.0566, - "step": 1771 - }, - { - "epoch": 32.21818181818182, - "grad_norm": 0.9517245888710022, - "learning_rate": 3.5600000000000002e-06, - "loss": 0.0369, - "step": 1772 - }, - { - "epoch": 32.236363636363635, - "grad_norm": 2.8129894733428955, - "learning_rate": 3.5563636363636365e-06, - "loss": 0.0663, - "step": 1773 - }, - { - "epoch": 32.25454545454546, - "grad_norm": 1.4583874940872192, - "learning_rate": 3.552727272727273e-06, - "loss": 0.0634, - "step": 1774 - }, - { - "epoch": 32.27272727272727, - "grad_norm": 2.0600342750549316, - "learning_rate": 3.549090909090909e-06, - "loss": 0.0745, - "step": 1775 - }, - { - "epoch": 32.29090909090909, - "grad_norm": 1.8947570323944092, - "learning_rate": 3.5454545454545458e-06, - "loss": 0.0802, - "step": 1776 - }, - { - "epoch": 32.30909090909091, - "grad_norm": 2.6806042194366455, - "learning_rate": 3.5418181818181825e-06, - "loss": 0.053, - "step": 1777 - }, - { - "epoch": 32.32727272727273, - "grad_norm": 1.298846960067749, - "learning_rate": 3.5381818181818183e-06, - "loss": 0.0769, - "step": 1778 - }, - { - "epoch": 32.345454545454544, - "grad_norm": 2.6742966175079346, - "learning_rate": 3.534545454545455e-06, - "loss": 0.0488, - "step": 1779 - }, - { - "epoch": 32.36363636363637, - "grad_norm": 1.8014206886291504, - "learning_rate": 3.530909090909091e-06, - "loss": 0.055, - "step": 1780 - }, - { - "epoch": 32.38181818181818, - "grad_norm": 0.7767223119735718, - "learning_rate": 3.5272727272727276e-06, - "loss": 0.0478, - "step": 1781 - }, - { - "epoch": 32.4, - "grad_norm": 2.0346508026123047, - "learning_rate": 3.523636363636364e-06, - "loss": 0.0532, - "step": 1782 - }, - { - "epoch": 32.41818181818182, - "grad_norm": 0.46261894702911377, - "learning_rate": 3.52e-06, - "loss": 0.045, - "step": 1783 - }, - { - "epoch": 32.43636363636364, - "grad_norm": 1.9721901416778564, - "learning_rate": 3.5163636363636365e-06, - "loss": 0.061, - "step": 1784 - }, - { - "epoch": 32.45454545454545, - "grad_norm": 0.383468896150589, - "learning_rate": 3.512727272727273e-06, - "loss": 0.0529, - "step": 1785 - }, - { - "epoch": 32.472727272727276, - "grad_norm": 0.3963320255279541, - "learning_rate": 3.509090909090909e-06, - "loss": 0.0736, - "step": 1786 - }, - { - "epoch": 32.49090909090909, - "grad_norm": 1.0956697463989258, - "learning_rate": 3.5054545454545457e-06, - "loss": 0.0618, - "step": 1787 - }, - { - "epoch": 32.50909090909091, - "grad_norm": 0.3084043562412262, - "learning_rate": 3.5018181818181824e-06, - "loss": 0.0513, - "step": 1788 - }, - { - "epoch": 32.527272727272724, - "grad_norm": 0.18538255989551544, - "learning_rate": 3.4981818181818183e-06, - "loss": 0.0514, - "step": 1789 - }, - { - "epoch": 32.54545454545455, - "grad_norm": 1.4432954788208008, - "learning_rate": 3.494545454545455e-06, - "loss": 0.073, - "step": 1790 - }, - { - "epoch": 32.56363636363636, - "grad_norm": 1.51544189453125, - "learning_rate": 3.4909090909090913e-06, - "loss": 0.0715, - "step": 1791 - }, - { - "epoch": 32.58181818181818, - "grad_norm": 0.9372082352638245, - "learning_rate": 3.4872727272727276e-06, - "loss": 0.059, - "step": 1792 - }, - { - "epoch": 32.6, - "grad_norm": 1.7225313186645508, - "learning_rate": 3.483636363636364e-06, - "loss": 0.0593, - "step": 1793 - }, - { - "epoch": 32.61818181818182, - "grad_norm": 1.5657143592834473, - "learning_rate": 3.48e-06, - "loss": 0.0611, - "step": 1794 - }, - { - "epoch": 32.63636363636363, - "grad_norm": 0.27677929401397705, - "learning_rate": 3.4763636363636364e-06, - "loss": 0.0501, - "step": 1795 - }, - { - "epoch": 32.654545454545456, - "grad_norm": 1.688311219215393, - "learning_rate": 3.472727272727273e-06, - "loss": 0.0674, - "step": 1796 - }, - { - "epoch": 32.67272727272727, - "grad_norm": 2.120985746383667, - "learning_rate": 3.469090909090909e-06, - "loss": 0.0579, - "step": 1797 - }, - { - "epoch": 32.69090909090909, - "grad_norm": 0.2686808407306671, - "learning_rate": 3.4654545454545457e-06, - "loss": 0.0642, - "step": 1798 - }, - { - "epoch": 32.70909090909091, - "grad_norm": 0.4087761342525482, - "learning_rate": 3.4618181818181824e-06, - "loss": 0.049, - "step": 1799 - }, - { - "epoch": 32.72727272727273, - "grad_norm": 0.6558623909950256, - "learning_rate": 3.4581818181818183e-06, - "loss": 0.0579, - "step": 1800 - }, - { - "epoch": 32.74545454545454, - "grad_norm": 0.7098785638809204, - "learning_rate": 3.454545454545455e-06, - "loss": 0.0531, - "step": 1801 - }, - { - "epoch": 32.763636363636365, - "grad_norm": 0.4082554876804352, - "learning_rate": 3.4509090909090912e-06, - "loss": 0.066, - "step": 1802 - }, - { - "epoch": 32.78181818181818, - "grad_norm": 0.26292142271995544, - "learning_rate": 3.4472727272727275e-06, - "loss": 0.0655, - "step": 1803 - }, - { - "epoch": 32.8, - "grad_norm": 0.4958662688732147, - "learning_rate": 3.443636363636364e-06, - "loss": 0.055, - "step": 1804 - }, - { - "epoch": 32.81818181818182, - "grad_norm": 1.9362311363220215, - "learning_rate": 3.44e-06, - "loss": 0.0607, - "step": 1805 - }, - { - "epoch": 32.836363636363636, - "grad_norm": 0.7056371569633484, - "learning_rate": 3.4363636363636364e-06, - "loss": 0.0493, - "step": 1806 - }, - { - "epoch": 32.85454545454545, - "grad_norm": 0.2841050624847412, - "learning_rate": 3.432727272727273e-06, - "loss": 0.0592, - "step": 1807 - }, - { - "epoch": 32.872727272727275, - "grad_norm": 0.841806173324585, - "learning_rate": 3.429090909090909e-06, - "loss": 0.0746, - "step": 1808 - }, - { - "epoch": 32.89090909090909, - "grad_norm": 0.5804309248924255, - "learning_rate": 3.4254545454545457e-06, - "loss": 0.0895, - "step": 1809 - }, - { - "epoch": 32.90909090909091, - "grad_norm": 1.988023042678833, - "learning_rate": 3.4218181818181824e-06, - "loss": 0.0582, - "step": 1810 - }, - { - "epoch": 32.92727272727273, - "grad_norm": 0.34771525859832764, - "learning_rate": 3.4181818181818182e-06, - "loss": 0.0694, - "step": 1811 - }, - { - "epoch": 32.945454545454545, - "grad_norm": 0.19159702956676483, - "learning_rate": 3.414545454545455e-06, - "loss": 0.04, - "step": 1812 - }, - { - "epoch": 32.96363636363636, - "grad_norm": 1.0464740991592407, - "learning_rate": 3.410909090909091e-06, - "loss": 0.0486, - "step": 1813 - }, - { - "epoch": 32.981818181818184, - "grad_norm": 0.9412320852279663, - "learning_rate": 3.4072727272727275e-06, - "loss": 0.0527, - "step": 1814 - }, - { - "epoch": 33.0, - "grad_norm": 1.0168473720550537, - "learning_rate": 3.4036363636363638e-06, - "loss": 0.0515, - "step": 1815 - }, - { - "epoch": 33.0, - "eval_loss": 0.05685354769229889, - "eval_runtime": 9.4813, - "eval_samples_per_second": 574.287, - "eval_steps_per_second": 71.825, - "step": 1815 - }, - { - "epoch": 33.018181818181816, - "grad_norm": 0.6542579531669617, - "learning_rate": 3.4000000000000005e-06, - "loss": 0.0704, - "step": 1816 - }, - { - "epoch": 33.03636363636364, - "grad_norm": 2.483635902404785, - "learning_rate": 3.3963636363636363e-06, - "loss": 0.0672, - "step": 1817 - }, - { - "epoch": 33.054545454545455, - "grad_norm": 1.5502185821533203, - "learning_rate": 3.392727272727273e-06, - "loss": 0.0583, - "step": 1818 - }, - { - "epoch": 33.07272727272727, - "grad_norm": 1.931560754776001, - "learning_rate": 3.389090909090909e-06, - "loss": 0.0615, - "step": 1819 - }, - { - "epoch": 33.09090909090909, - "grad_norm": 0.49402159452438354, - "learning_rate": 3.3854545454545456e-06, - "loss": 0.0479, - "step": 1820 - }, - { - "epoch": 33.10909090909091, - "grad_norm": 2.1230928897857666, - "learning_rate": 3.3818181818181823e-06, - "loss": 0.0635, - "step": 1821 - }, - { - "epoch": 33.127272727272725, - "grad_norm": 3.6523661613464355, - "learning_rate": 3.378181818181818e-06, - "loss": 0.0601, - "step": 1822 - }, - { - "epoch": 33.14545454545455, - "grad_norm": 3.301032066345215, - "learning_rate": 3.374545454545455e-06, - "loss": 0.0732, - "step": 1823 - }, - { - "epoch": 33.163636363636364, - "grad_norm": 2.4173977375030518, - "learning_rate": 3.370909090909091e-06, - "loss": 0.058, - "step": 1824 - }, - { - "epoch": 33.18181818181818, - "grad_norm": 0.39347362518310547, - "learning_rate": 3.3672727272727275e-06, - "loss": 0.063, - "step": 1825 - }, - { - "epoch": 33.2, - "grad_norm": 0.6041854619979858, - "learning_rate": 3.3636363636363637e-06, - "loss": 0.0728, - "step": 1826 - }, - { - "epoch": 33.21818181818182, - "grad_norm": 3.837782144546509, - "learning_rate": 3.3600000000000004e-06, - "loss": 0.0736, - "step": 1827 - }, - { - "epoch": 33.236363636363635, - "grad_norm": 3.023122787475586, - "learning_rate": 3.3563636363636363e-06, - "loss": 0.0502, - "step": 1828 - }, - { - "epoch": 33.25454545454546, - "grad_norm": 3.0526113510131836, - "learning_rate": 3.352727272727273e-06, - "loss": 0.0548, - "step": 1829 - }, - { - "epoch": 33.27272727272727, - "grad_norm": 0.35683321952819824, - "learning_rate": 3.3490909090909097e-06, - "loss": 0.0617, - "step": 1830 - }, - { - "epoch": 33.29090909090909, - "grad_norm": 0.637310266494751, - "learning_rate": 3.3454545454545456e-06, - "loss": 0.0509, - "step": 1831 - }, - { - "epoch": 33.30909090909091, - "grad_norm": 2.3833470344543457, - "learning_rate": 3.3418181818181823e-06, - "loss": 0.0667, - "step": 1832 - }, - { - "epoch": 33.32727272727273, - "grad_norm": 3.276942729949951, - "learning_rate": 3.338181818181818e-06, - "loss": 0.0667, - "step": 1833 - }, - { - "epoch": 33.345454545454544, - "grad_norm": 2.6857059001922607, - "learning_rate": 3.334545454545455e-06, - "loss": 0.0468, - "step": 1834 - }, - { - "epoch": 33.36363636363637, - "grad_norm": 1.4197156429290771, - "learning_rate": 3.330909090909091e-06, - "loss": 0.0535, - "step": 1835 - }, - { - "epoch": 33.38181818181818, - "grad_norm": 1.7840412855148315, - "learning_rate": 3.3272727272727274e-06, - "loss": 0.0478, - "step": 1836 - }, - { - "epoch": 33.4, - "grad_norm": 1.698567271232605, - "learning_rate": 3.3236363636363637e-06, - "loss": 0.0666, - "step": 1837 - }, - { - "epoch": 33.41818181818182, - "grad_norm": 1.1378190517425537, - "learning_rate": 3.3200000000000004e-06, - "loss": 0.069, - "step": 1838 - }, - { - "epoch": 33.43636363636364, - "grad_norm": 0.46061190962791443, - "learning_rate": 3.3163636363636363e-06, - "loss": 0.0493, - "step": 1839 - }, - { - "epoch": 33.45454545454545, - "grad_norm": 1.9801716804504395, - "learning_rate": 3.312727272727273e-06, - "loss": 0.0723, - "step": 1840 - }, - { - "epoch": 33.472727272727276, - "grad_norm": 0.7456345558166504, - "learning_rate": 3.3090909090909097e-06, - "loss": 0.0604, - "step": 1841 - }, - { - "epoch": 33.49090909090909, - "grad_norm": 1.008011817932129, - "learning_rate": 3.3054545454545455e-06, - "loss": 0.0719, - "step": 1842 - }, - { - "epoch": 33.50909090909091, - "grad_norm": 0.5175331234931946, - "learning_rate": 3.3018181818181822e-06, - "loss": 0.0595, - "step": 1843 - }, - { - "epoch": 33.527272727272724, - "grad_norm": 1.1063215732574463, - "learning_rate": 3.2981818181818185e-06, - "loss": 0.0583, - "step": 1844 - }, - { - "epoch": 33.54545454545455, - "grad_norm": 1.6811844110488892, - "learning_rate": 3.294545454545455e-06, - "loss": 0.0683, - "step": 1845 - }, - { - "epoch": 33.56363636363636, - "grad_norm": 2.20259690284729, - "learning_rate": 3.290909090909091e-06, - "loss": 0.0562, - "step": 1846 - }, - { - "epoch": 33.58181818181818, - "grad_norm": 0.8648203015327454, - "learning_rate": 3.2872727272727274e-06, - "loss": 0.0666, - "step": 1847 - }, - { - "epoch": 33.6, - "grad_norm": 0.5249006152153015, - "learning_rate": 3.2836363636363637e-06, - "loss": 0.0571, - "step": 1848 - }, - { - "epoch": 33.61818181818182, - "grad_norm": 2.5403025150299072, - "learning_rate": 3.2800000000000004e-06, - "loss": 0.0573, - "step": 1849 - }, - { - "epoch": 33.63636363636363, - "grad_norm": 2.9431092739105225, - "learning_rate": 3.2763636363636362e-06, - "loss": 0.0661, - "step": 1850 - }, - { - "epoch": 33.654545454545456, - "grad_norm": 1.0883985757827759, - "learning_rate": 3.272727272727273e-06, - "loss": 0.0598, - "step": 1851 - }, - { - "epoch": 33.67272727272727, - "grad_norm": 0.241225466132164, - "learning_rate": 3.2690909090909096e-06, - "loss": 0.0465, - "step": 1852 - }, - { - "epoch": 33.69090909090909, - "grad_norm": 0.843015193939209, - "learning_rate": 3.2654545454545455e-06, - "loss": 0.0438, - "step": 1853 - }, - { - "epoch": 33.70909090909091, - "grad_norm": 1.8849526643753052, - "learning_rate": 3.261818181818182e-06, - "loss": 0.0448, - "step": 1854 - }, - { - "epoch": 33.72727272727273, - "grad_norm": 0.23897644877433777, - "learning_rate": 3.2581818181818185e-06, - "loss": 0.0733, - "step": 1855 - }, - { - "epoch": 33.74545454545454, - "grad_norm": 1.2283865213394165, - "learning_rate": 3.2545454545454548e-06, - "loss": 0.0674, - "step": 1856 - }, - { - "epoch": 33.763636363636365, - "grad_norm": 0.7539355754852295, - "learning_rate": 3.250909090909091e-06, - "loss": 0.0605, - "step": 1857 - }, - { - "epoch": 33.78181818181818, - "grad_norm": 0.7008834481239319, - "learning_rate": 3.2472727272727278e-06, - "loss": 0.0662, - "step": 1858 - }, - { - "epoch": 33.8, - "grad_norm": 1.2153860330581665, - "learning_rate": 3.2436363636363636e-06, - "loss": 0.0592, - "step": 1859 - }, - { - "epoch": 33.81818181818182, - "grad_norm": 1.2414518594741821, - "learning_rate": 3.2400000000000003e-06, - "loss": 0.0632, - "step": 1860 - }, - { - "epoch": 33.836363636363636, - "grad_norm": 1.6845035552978516, - "learning_rate": 3.236363636363636e-06, - "loss": 0.0698, - "step": 1861 - }, - { - "epoch": 33.85454545454545, - "grad_norm": 0.6251642107963562, - "learning_rate": 3.232727272727273e-06, - "loss": 0.0413, - "step": 1862 - }, - { - "epoch": 33.872727272727275, - "grad_norm": 1.7469240427017212, - "learning_rate": 3.2290909090909096e-06, - "loss": 0.0604, - "step": 1863 - }, - { - "epoch": 33.89090909090909, - "grad_norm": 0.6765925288200378, - "learning_rate": 3.2254545454545455e-06, - "loss": 0.0623, - "step": 1864 - }, - { - "epoch": 33.90909090909091, - "grad_norm": 1.073602557182312, - "learning_rate": 3.221818181818182e-06, - "loss": 0.0571, - "step": 1865 - }, - { - "epoch": 33.92727272727273, - "grad_norm": 0.20769499242305756, - "learning_rate": 3.2181818181818184e-06, - "loss": 0.0504, - "step": 1866 - }, - { - "epoch": 33.945454545454545, - "grad_norm": 1.3547552824020386, - "learning_rate": 3.2145454545454547e-06, - "loss": 0.0428, - "step": 1867 - }, - { - "epoch": 33.96363636363636, - "grad_norm": 0.7906819581985474, - "learning_rate": 3.210909090909091e-06, - "loss": 0.0565, - "step": 1868 - }, - { - "epoch": 33.981818181818184, - "grad_norm": 0.47991809248924255, - "learning_rate": 3.2072727272727277e-06, - "loss": 0.0498, - "step": 1869 - }, - { - "epoch": 34.0, - "grad_norm": 1.3131229877471924, - "learning_rate": 3.2036363636363636e-06, - "loss": 0.0554, - "step": 1870 - }, - { - "epoch": 34.0, - "eval_loss": 0.05629918724298477, - "eval_runtime": 8.8732, - "eval_samples_per_second": 613.643, - "eval_steps_per_second": 76.748, - "step": 1870 - }, - { - "epoch": 34.018181818181816, - "grad_norm": 0.29682832956314087, - "learning_rate": 3.2000000000000003e-06, - "loss": 0.0597, - "step": 1871 - }, - { - "epoch": 34.03636363636364, - "grad_norm": 0.7377836108207703, - "learning_rate": 3.196363636363637e-06, - "loss": 0.0746, - "step": 1872 - }, - { - "epoch": 34.054545454545455, - "grad_norm": 0.7535310983657837, - "learning_rate": 3.192727272727273e-06, - "loss": 0.0495, - "step": 1873 - }, - { - "epoch": 34.07272727272727, - "grad_norm": 0.9438050389289856, - "learning_rate": 3.1890909090909096e-06, - "loss": 0.0636, - "step": 1874 - }, - { - "epoch": 34.09090909090909, - "grad_norm": 0.20112298429012299, - "learning_rate": 3.1854545454545454e-06, - "loss": 0.0527, - "step": 1875 - }, - { - "epoch": 34.10909090909091, - "grad_norm": 0.8236849308013916, - "learning_rate": 3.181818181818182e-06, - "loss": 0.0682, - "step": 1876 - }, - { - "epoch": 34.127272727272725, - "grad_norm": 0.8581752181053162, - "learning_rate": 3.1781818181818184e-06, - "loss": 0.0608, - "step": 1877 - }, - { - "epoch": 34.14545454545455, - "grad_norm": 2.092151165008545, - "learning_rate": 3.1745454545454547e-06, - "loss": 0.0578, - "step": 1878 - }, - { - "epoch": 34.163636363636364, - "grad_norm": 1.3740298748016357, - "learning_rate": 3.170909090909091e-06, - "loss": 0.0547, - "step": 1879 - }, - { - "epoch": 34.18181818181818, - "grad_norm": 1.8301074504852295, - "learning_rate": 3.1672727272727277e-06, - "loss": 0.0759, - "step": 1880 - }, - { - "epoch": 34.2, - "grad_norm": 0.4698159992694855, - "learning_rate": 3.1636363636363635e-06, - "loss": 0.061, - "step": 1881 - }, - { - "epoch": 34.21818181818182, - "grad_norm": 1.4072908163070679, - "learning_rate": 3.1600000000000002e-06, - "loss": 0.0736, - "step": 1882 - }, - { - "epoch": 34.236363636363635, - "grad_norm": 2.7243475914001465, - "learning_rate": 3.156363636363637e-06, - "loss": 0.0581, - "step": 1883 - }, - { - "epoch": 34.25454545454546, - "grad_norm": 1.817193865776062, - "learning_rate": 3.152727272727273e-06, - "loss": 0.0585, - "step": 1884 - }, - { - "epoch": 34.27272727272727, - "grad_norm": 0.538291871547699, - "learning_rate": 3.1490909090909095e-06, - "loss": 0.0375, - "step": 1885 - }, - { - "epoch": 34.29090909090909, - "grad_norm": 2.0751898288726807, - "learning_rate": 3.145454545454546e-06, - "loss": 0.0541, - "step": 1886 - }, - { - "epoch": 34.30909090909091, - "grad_norm": 4.025467872619629, - "learning_rate": 3.141818181818182e-06, - "loss": 0.0764, - "step": 1887 - }, - { - "epoch": 34.32727272727273, - "grad_norm": 1.4230632781982422, - "learning_rate": 3.1381818181818184e-06, - "loss": 0.0492, - "step": 1888 - }, - { - "epoch": 34.345454545454544, - "grad_norm": 0.2260051667690277, - "learning_rate": 3.1345454545454546e-06, - "loss": 0.0654, - "step": 1889 - }, - { - "epoch": 34.36363636363637, - "grad_norm": 1.7454774379730225, - "learning_rate": 3.130909090909091e-06, - "loss": 0.0577, - "step": 1890 - }, - { - "epoch": 34.38181818181818, - "grad_norm": 1.02946937084198, - "learning_rate": 3.1272727272727276e-06, - "loss": 0.0526, - "step": 1891 - }, - { - "epoch": 34.4, - "grad_norm": 1.3477122783660889, - "learning_rate": 3.1236363636363635e-06, - "loss": 0.0609, - "step": 1892 - }, - { - "epoch": 34.41818181818182, - "grad_norm": 0.47269904613494873, - "learning_rate": 3.12e-06, - "loss": 0.0523, - "step": 1893 - }, - { - "epoch": 34.43636363636364, - "grad_norm": 0.6944325566291809, - "learning_rate": 3.116363636363637e-06, - "loss": 0.0462, - "step": 1894 - }, - { - "epoch": 34.45454545454545, - "grad_norm": 1.8110796213150024, - "learning_rate": 3.1127272727272728e-06, - "loss": 0.059, - "step": 1895 - }, - { - "epoch": 34.472727272727276, - "grad_norm": 0.8638215661048889, - "learning_rate": 3.1090909090909095e-06, - "loss": 0.0658, - "step": 1896 - }, - { - "epoch": 34.49090909090909, - "grad_norm": 0.8048219084739685, - "learning_rate": 3.1054545454545458e-06, - "loss": 0.0545, - "step": 1897 - }, - { - "epoch": 34.50909090909091, - "grad_norm": 2.000316858291626, - "learning_rate": 3.101818181818182e-06, - "loss": 0.0411, - "step": 1898 - }, - { - "epoch": 34.527272727272724, - "grad_norm": 1.1194860935211182, - "learning_rate": 3.0981818181818183e-06, - "loss": 0.0373, - "step": 1899 - }, - { - "epoch": 34.54545454545455, - "grad_norm": 0.22629152238368988, - "learning_rate": 3.094545454545455e-06, - "loss": 0.0519, - "step": 1900 - }, - { - "epoch": 34.56363636363636, - "grad_norm": 3.5331106185913086, - "learning_rate": 3.090909090909091e-06, - "loss": 0.0921, - "step": 1901 - }, - { - "epoch": 34.58181818181818, - "grad_norm": 3.109893560409546, - "learning_rate": 3.0872727272727276e-06, - "loss": 0.0778, - "step": 1902 - }, - { - "epoch": 34.6, - "grad_norm": 1.7383209466934204, - "learning_rate": 3.0836363636363635e-06, - "loss": 0.0687, - "step": 1903 - }, - { - "epoch": 34.61818181818182, - "grad_norm": 0.6748665571212769, - "learning_rate": 3.08e-06, - "loss": 0.0584, - "step": 1904 - }, - { - "epoch": 34.63636363636363, - "grad_norm": 1.5138969421386719, - "learning_rate": 3.076363636363637e-06, - "loss": 0.0413, - "step": 1905 - }, - { - "epoch": 34.654545454545456, - "grad_norm": 2.9789021015167236, - "learning_rate": 3.0727272727272727e-06, - "loss": 0.0782, - "step": 1906 - }, - { - "epoch": 34.67272727272727, - "grad_norm": 1.5702357292175293, - "learning_rate": 3.0690909090909094e-06, - "loss": 0.0614, - "step": 1907 - }, - { - "epoch": 34.69090909090909, - "grad_norm": 0.701515257358551, - "learning_rate": 3.0654545454545457e-06, - "loss": 0.067, - "step": 1908 - }, - { - "epoch": 34.70909090909091, - "grad_norm": 0.7018170356750488, - "learning_rate": 3.061818181818182e-06, - "loss": 0.0651, - "step": 1909 - }, - { - "epoch": 34.72727272727273, - "grad_norm": 1.0604552030563354, - "learning_rate": 3.0581818181818183e-06, - "loss": 0.0499, - "step": 1910 - }, - { - "epoch": 34.74545454545454, - "grad_norm": 2.6187002658843994, - "learning_rate": 3.054545454545455e-06, - "loss": 0.0665, - "step": 1911 - }, - { - "epoch": 34.763636363636365, - "grad_norm": 3.043273687362671, - "learning_rate": 3.050909090909091e-06, - "loss": 0.0596, - "step": 1912 - }, - { - "epoch": 34.78181818181818, - "grad_norm": 1.024476408958435, - "learning_rate": 3.0472727272727276e-06, - "loss": 0.0454, - "step": 1913 - }, - { - "epoch": 34.8, - "grad_norm": 0.39750298857688904, - "learning_rate": 3.0436363636363634e-06, - "loss": 0.0496, - "step": 1914 - }, - { - "epoch": 34.81818181818182, - "grad_norm": 1.344956398010254, - "learning_rate": 3.04e-06, - "loss": 0.065, - "step": 1915 - }, - { - "epoch": 34.836363636363636, - "grad_norm": 1.5824931859970093, - "learning_rate": 3.036363636363637e-06, - "loss": 0.0517, - "step": 1916 - }, - { - "epoch": 34.85454545454545, - "grad_norm": 0.8887394666671753, - "learning_rate": 3.0327272727272727e-06, - "loss": 0.0572, - "step": 1917 - }, - { - "epoch": 34.872727272727275, - "grad_norm": 0.9421762228012085, - "learning_rate": 3.0290909090909094e-06, - "loss": 0.0679, - "step": 1918 - }, - { - "epoch": 34.89090909090909, - "grad_norm": 0.8637071847915649, - "learning_rate": 3.0254545454545457e-06, - "loss": 0.0661, - "step": 1919 - }, - { - "epoch": 34.90909090909091, - "grad_norm": 0.48839643597602844, - "learning_rate": 3.021818181818182e-06, - "loss": 0.0453, - "step": 1920 - }, - { - "epoch": 34.92727272727273, - "grad_norm": 0.237548366189003, - "learning_rate": 3.0181818181818182e-06, - "loss": 0.0716, - "step": 1921 - }, - { - "epoch": 34.945454545454545, - "grad_norm": 1.106153964996338, - "learning_rate": 3.014545454545455e-06, - "loss": 0.064, - "step": 1922 - }, - { - "epoch": 34.96363636363636, - "grad_norm": 1.02920663356781, - "learning_rate": 3.010909090909091e-06, - "loss": 0.0454, - "step": 1923 - }, - { - "epoch": 34.981818181818184, - "grad_norm": 0.7243247628211975, - "learning_rate": 3.0072727272727275e-06, - "loss": 0.0547, - "step": 1924 - }, - { - "epoch": 35.0, - "grad_norm": 0.6379866600036621, - "learning_rate": 3.0036363636363642e-06, - "loss": 0.0792, - "step": 1925 - }, - { - "epoch": 35.0, - "eval_loss": 0.056581877171993256, - "eval_runtime": 9.48, - "eval_samples_per_second": 574.367, - "eval_steps_per_second": 71.835, - "step": 1925 - }, - { - "epoch": 35.018181818181816, - "grad_norm": 1.513132929801941, - "learning_rate": 3e-06, - "loss": 0.0713, - "step": 1926 - }, - { - "epoch": 35.03636363636364, - "grad_norm": 1.670650839805603, - "learning_rate": 2.9963636363636368e-06, - "loss": 0.0731, - "step": 1927 - }, - { - "epoch": 35.054545454545455, - "grad_norm": 0.3674888610839844, - "learning_rate": 2.9927272727272726e-06, - "loss": 0.0574, - "step": 1928 - }, - { - "epoch": 35.07272727272727, - "grad_norm": 1.106646180152893, - "learning_rate": 2.9890909090909093e-06, - "loss": 0.0529, - "step": 1929 - }, - { - "epoch": 35.09090909090909, - "grad_norm": 0.8308770060539246, - "learning_rate": 2.9854545454545456e-06, - "loss": 0.0589, - "step": 1930 - }, - { - "epoch": 35.10909090909091, - "grad_norm": 1.554348349571228, - "learning_rate": 2.981818181818182e-06, - "loss": 0.0439, - "step": 1931 - }, - { - "epoch": 35.127272727272725, - "grad_norm": 1.449559211730957, - "learning_rate": 2.978181818181818e-06, - "loss": 0.0464, - "step": 1932 - }, - { - "epoch": 35.14545454545455, - "grad_norm": 1.1925181150436401, - "learning_rate": 2.974545454545455e-06, - "loss": 0.0602, - "step": 1933 - }, - { - "epoch": 35.163636363636364, - "grad_norm": 0.819275975227356, - "learning_rate": 2.9709090909090908e-06, - "loss": 0.0562, - "step": 1934 - }, - { - "epoch": 35.18181818181818, - "grad_norm": 1.2199746370315552, - "learning_rate": 2.9672727272727275e-06, - "loss": 0.0611, - "step": 1935 - }, - { - "epoch": 35.2, - "grad_norm": 0.36668169498443604, - "learning_rate": 2.963636363636364e-06, - "loss": 0.0578, - "step": 1936 - }, - { - "epoch": 35.21818181818182, - "grad_norm": 0.5444760918617249, - "learning_rate": 2.96e-06, - "loss": 0.0562, - "step": 1937 - }, - { - "epoch": 35.236363636363635, - "grad_norm": 0.49110403656959534, - "learning_rate": 2.9563636363636367e-06, - "loss": 0.0605, - "step": 1938 - }, - { - "epoch": 35.25454545454546, - "grad_norm": 0.32998165488243103, - "learning_rate": 2.952727272727273e-06, - "loss": 0.0522, - "step": 1939 - }, - { - "epoch": 35.27272727272727, - "grad_norm": 0.31773263216018677, - "learning_rate": 2.9490909090909093e-06, - "loss": 0.0662, - "step": 1940 - }, - { - "epoch": 35.29090909090909, - "grad_norm": 0.3170473575592041, - "learning_rate": 2.9454545454545456e-06, - "loss": 0.0643, - "step": 1941 - }, - { - "epoch": 35.30909090909091, - "grad_norm": 0.2330743968486786, - "learning_rate": 2.941818181818182e-06, - "loss": 0.0621, - "step": 1942 - }, - { - "epoch": 35.32727272727273, - "grad_norm": 0.2344777137041092, - "learning_rate": 2.938181818181818e-06, - "loss": 0.0448, - "step": 1943 - }, - { - "epoch": 35.345454545454544, - "grad_norm": 0.5844224691390991, - "learning_rate": 2.934545454545455e-06, - "loss": 0.0581, - "step": 1944 - }, - { - "epoch": 35.36363636363637, - "grad_norm": 0.9478474855422974, - "learning_rate": 2.9309090909090907e-06, - "loss": 0.0631, - "step": 1945 - }, - { - "epoch": 35.38181818181818, - "grad_norm": 1.1213778257369995, - "learning_rate": 2.9272727272727274e-06, - "loss": 0.0527, - "step": 1946 - }, - { - "epoch": 35.4, - "grad_norm": 1.561166763305664, - "learning_rate": 2.923636363636364e-06, - "loss": 0.0548, - "step": 1947 - }, - { - "epoch": 35.41818181818182, - "grad_norm": 0.5596808195114136, - "learning_rate": 2.92e-06, - "loss": 0.0471, - "step": 1948 - }, - { - "epoch": 35.43636363636364, - "grad_norm": 0.2933318614959717, - "learning_rate": 2.9163636363636367e-06, - "loss": 0.0457, - "step": 1949 - }, - { - "epoch": 35.45454545454545, - "grad_norm": 0.4479397237300873, - "learning_rate": 2.912727272727273e-06, - "loss": 0.0683, - "step": 1950 - }, - { - "epoch": 35.472727272727276, - "grad_norm": 1.3687024116516113, - "learning_rate": 2.9090909090909093e-06, - "loss": 0.0592, - "step": 1951 - }, - { - "epoch": 35.49090909090909, - "grad_norm": 0.49250105023384094, - "learning_rate": 2.9054545454545456e-06, - "loss": 0.0421, - "step": 1952 - }, - { - "epoch": 35.50909090909091, - "grad_norm": 0.4911685585975647, - "learning_rate": 2.9018181818181823e-06, - "loss": 0.0645, - "step": 1953 - }, - { - "epoch": 35.527272727272724, - "grad_norm": 0.9318827986717224, - "learning_rate": 2.898181818181818e-06, - "loss": 0.0588, - "step": 1954 - }, - { - "epoch": 35.54545454545455, - "grad_norm": 1.2889800071716309, - "learning_rate": 2.894545454545455e-06, - "loss": 0.0649, - "step": 1955 - }, - { - "epoch": 35.56363636363636, - "grad_norm": 0.6601005792617798, - "learning_rate": 2.8909090909090907e-06, - "loss": 0.0657, - "step": 1956 - }, - { - "epoch": 35.58181818181818, - "grad_norm": 0.6882979869842529, - "learning_rate": 2.8872727272727274e-06, - "loss": 0.0552, - "step": 1957 - }, - { - "epoch": 35.6, - "grad_norm": 1.0565662384033203, - "learning_rate": 2.883636363636364e-06, - "loss": 0.0596, - "step": 1958 - }, - { - "epoch": 35.61818181818182, - "grad_norm": 1.5208719968795776, - "learning_rate": 2.88e-06, - "loss": 0.0606, - "step": 1959 - }, - { - "epoch": 35.63636363636363, - "grad_norm": 1.3108513355255127, - "learning_rate": 2.8763636363636367e-06, - "loss": 0.0443, - "step": 1960 - }, - { - "epoch": 35.654545454545456, - "grad_norm": 1.94813871383667, - "learning_rate": 2.872727272727273e-06, - "loss": 0.0539, - "step": 1961 - }, - { - "epoch": 35.67272727272727, - "grad_norm": 0.671384871006012, - "learning_rate": 2.8690909090909092e-06, - "loss": 0.0663, - "step": 1962 - }, - { - "epoch": 35.69090909090909, - "grad_norm": 1.1445895433425903, - "learning_rate": 2.8654545454545455e-06, - "loss": 0.0541, - "step": 1963 - }, - { - "epoch": 35.70909090909091, - "grad_norm": 0.5181692242622375, - "learning_rate": 2.8618181818181822e-06, - "loss": 0.084, - "step": 1964 - }, - { - "epoch": 35.72727272727273, - "grad_norm": 1.4044469594955444, - "learning_rate": 2.858181818181818e-06, - "loss": 0.0516, - "step": 1965 - }, - { - "epoch": 35.74545454545454, - "grad_norm": 0.3482029139995575, - "learning_rate": 2.8545454545454548e-06, - "loss": 0.0658, - "step": 1966 - }, - { - "epoch": 35.763636363636365, - "grad_norm": 0.23444029688835144, - "learning_rate": 2.8509090909090915e-06, - "loss": 0.0559, - "step": 1967 - }, - { - "epoch": 35.78181818181818, - "grad_norm": 1.2889277935028076, - "learning_rate": 2.8472727272727273e-06, - "loss": 0.0651, - "step": 1968 - }, - { - "epoch": 35.8, - "grad_norm": 1.3078091144561768, - "learning_rate": 2.843636363636364e-06, - "loss": 0.0637, - "step": 1969 - }, - { - "epoch": 35.81818181818182, - "grad_norm": 1.1007256507873535, - "learning_rate": 2.84e-06, - "loss": 0.0548, - "step": 1970 - }, - { - "epoch": 35.836363636363636, - "grad_norm": 0.7665208578109741, - "learning_rate": 2.8363636363636366e-06, - "loss": 0.0661, - "step": 1971 - }, - { - "epoch": 35.85454545454545, - "grad_norm": 0.6389698386192322, - "learning_rate": 2.832727272727273e-06, - "loss": 0.0486, - "step": 1972 - }, - { - "epoch": 35.872727272727275, - "grad_norm": 1.2304587364196777, - "learning_rate": 2.829090909090909e-06, - "loss": 0.0663, - "step": 1973 - }, - { - "epoch": 35.89090909090909, - "grad_norm": 1.1657696962356567, - "learning_rate": 2.8254545454545455e-06, - "loss": 0.0737, - "step": 1974 - }, - { - "epoch": 35.90909090909091, - "grad_norm": 0.18609900772571564, - "learning_rate": 2.821818181818182e-06, - "loss": 0.044, - "step": 1975 - }, - { - "epoch": 35.92727272727273, - "grad_norm": 0.5427846908569336, - "learning_rate": 2.818181818181818e-06, - "loss": 0.0491, - "step": 1976 - }, - { - "epoch": 35.945454545454545, - "grad_norm": 0.3882474899291992, - "learning_rate": 2.8145454545454547e-06, - "loss": 0.0637, - "step": 1977 - }, - { - "epoch": 35.96363636363636, - "grad_norm": 0.3063582181930542, - "learning_rate": 2.8109090909090914e-06, - "loss": 0.0652, - "step": 1978 - }, - { - "epoch": 35.981818181818184, - "grad_norm": 1.761152744293213, - "learning_rate": 2.8072727272727273e-06, - "loss": 0.0652, - "step": 1979 - }, - { - "epoch": 36.0, - "grad_norm": 2.367624282836914, - "learning_rate": 2.803636363636364e-06, - "loss": 0.0486, - "step": 1980 - }, - { - "epoch": 36.0, - "eval_loss": 0.05606954172253609, - "eval_runtime": 9.2677, - "eval_samples_per_second": 587.526, - "eval_steps_per_second": 73.481, - "step": 1980 - }, - { - "epoch": 36.018181818181816, - "grad_norm": 1.1724528074264526, - "learning_rate": 2.8000000000000003e-06, - "loss": 0.0576, - "step": 1981 - }, - { - "epoch": 36.03636363636364, - "grad_norm": 0.5771242380142212, - "learning_rate": 2.7963636363636366e-06, - "loss": 0.054, - "step": 1982 - }, - { - "epoch": 36.054545454545455, - "grad_norm": 0.30733516812324524, - "learning_rate": 2.792727272727273e-06, - "loss": 0.0449, - "step": 1983 - }, - { - "epoch": 36.07272727272727, - "grad_norm": 1.0500391721725464, - "learning_rate": 2.789090909090909e-06, - "loss": 0.0587, - "step": 1984 - }, - { - "epoch": 36.09090909090909, - "grad_norm": 1.0554752349853516, - "learning_rate": 2.7854545454545454e-06, - "loss": 0.0658, - "step": 1985 - }, - { - "epoch": 36.10909090909091, - "grad_norm": 1.2704918384552002, - "learning_rate": 2.781818181818182e-06, - "loss": 0.0628, - "step": 1986 - }, - { - "epoch": 36.127272727272725, - "grad_norm": 0.5215453505516052, - "learning_rate": 2.778181818181818e-06, - "loss": 0.038, - "step": 1987 - }, - { - "epoch": 36.14545454545455, - "grad_norm": 0.6701186299324036, - "learning_rate": 2.7745454545454547e-06, - "loss": 0.0583, - "step": 1988 - }, - { - "epoch": 36.163636363636364, - "grad_norm": 0.39379340410232544, - "learning_rate": 2.7709090909090914e-06, - "loss": 0.0564, - "step": 1989 - }, - { - "epoch": 36.18181818181818, - "grad_norm": 0.2908765971660614, - "learning_rate": 2.7672727272727273e-06, - "loss": 0.0561, - "step": 1990 - }, - { - "epoch": 36.2, - "grad_norm": 0.720541775226593, - "learning_rate": 2.763636363636364e-06, - "loss": 0.0612, - "step": 1991 - }, - { - "epoch": 36.21818181818182, - "grad_norm": 0.2868873178958893, - "learning_rate": 2.7600000000000003e-06, - "loss": 0.0691, - "step": 1992 - }, - { - "epoch": 36.236363636363635, - "grad_norm": 1.4934320449829102, - "learning_rate": 2.7563636363636365e-06, - "loss": 0.0595, - "step": 1993 - }, - { - "epoch": 36.25454545454546, - "grad_norm": 0.35772159695625305, - "learning_rate": 2.752727272727273e-06, - "loss": 0.0569, - "step": 1994 - }, - { - "epoch": 36.27272727272727, - "grad_norm": 0.7900968790054321, - "learning_rate": 2.7490909090909095e-06, - "loss": 0.0719, - "step": 1995 - }, - { - "epoch": 36.29090909090909, - "grad_norm": 0.9167521595954895, - "learning_rate": 2.7454545454545454e-06, - "loss": 0.0569, - "step": 1996 - }, - { - "epoch": 36.30909090909091, - "grad_norm": 0.22400321066379547, - "learning_rate": 2.741818181818182e-06, - "loss": 0.0643, - "step": 1997 - }, - { - "epoch": 36.32727272727273, - "grad_norm": 1.1621659994125366, - "learning_rate": 2.738181818181818e-06, - "loss": 0.0647, - "step": 1998 - }, - { - "epoch": 36.345454545454544, - "grad_norm": 0.3633122146129608, - "learning_rate": 2.7345454545454547e-06, - "loss": 0.0744, - "step": 1999 - }, - { - "epoch": 36.36363636363637, - "grad_norm": 0.19546788930892944, - "learning_rate": 2.7309090909090914e-06, - "loss": 0.0629, - "step": 2000 - }, - { - "epoch": 36.38181818181818, - "grad_norm": 1.1438462734222412, - "learning_rate": 2.7272727272727272e-06, - "loss": 0.0579, - "step": 2001 - }, - { - "epoch": 36.4, - "grad_norm": 0.4075356721878052, - "learning_rate": 2.723636363636364e-06, - "loss": 0.0554, - "step": 2002 - }, - { - "epoch": 36.41818181818182, - "grad_norm": 0.5509101748466492, - "learning_rate": 2.7200000000000002e-06, - "loss": 0.066, - "step": 2003 - }, - { - "epoch": 36.43636363636364, - "grad_norm": 1.6690434217453003, - "learning_rate": 2.7163636363636365e-06, - "loss": 0.048, - "step": 2004 - }, - { - "epoch": 36.45454545454545, - "grad_norm": 2.3762667179107666, - "learning_rate": 2.7127272727272728e-06, - "loss": 0.0664, - "step": 2005 - }, - { - "epoch": 36.472727272727276, - "grad_norm": 0.39946484565734863, - "learning_rate": 2.7090909090909095e-06, - "loss": 0.0812, - "step": 2006 - }, - { - "epoch": 36.49090909090909, - "grad_norm": 1.0143463611602783, - "learning_rate": 2.7054545454545453e-06, - "loss": 0.0743, - "step": 2007 - }, - { - "epoch": 36.50909090909091, - "grad_norm": 0.865645706653595, - "learning_rate": 2.701818181818182e-06, - "loss": 0.0535, - "step": 2008 - }, - { - "epoch": 36.527272727272724, - "grad_norm": 0.42737045884132385, - "learning_rate": 2.6981818181818188e-06, - "loss": 0.0581, - "step": 2009 - }, - { - "epoch": 36.54545454545455, - "grad_norm": 0.7565283179283142, - "learning_rate": 2.6945454545454546e-06, - "loss": 0.0507, - "step": 2010 - }, - { - "epoch": 36.56363636363636, - "grad_norm": 1.0424872636795044, - "learning_rate": 2.6909090909090913e-06, - "loss": 0.0505, - "step": 2011 - }, - { - "epoch": 36.58181818181818, - "grad_norm": 0.23233893513679504, - "learning_rate": 2.687272727272727e-06, - "loss": 0.0473, - "step": 2012 - }, - { - "epoch": 36.6, - "grad_norm": 0.3067811131477356, - "learning_rate": 2.683636363636364e-06, - "loss": 0.0654, - "step": 2013 - }, - { - "epoch": 36.61818181818182, - "grad_norm": 1.83575439453125, - "learning_rate": 2.68e-06, - "loss": 0.0523, - "step": 2014 - }, - { - "epoch": 36.63636363636363, - "grad_norm": 1.4918915033340454, - "learning_rate": 2.6763636363636365e-06, - "loss": 0.0584, - "step": 2015 - }, - { - "epoch": 36.654545454545456, - "grad_norm": 1.6397899389266968, - "learning_rate": 2.6727272727272727e-06, - "loss": 0.0538, - "step": 2016 - }, - { - "epoch": 36.67272727272727, - "grad_norm": 2.1939306259155273, - "learning_rate": 2.6690909090909094e-06, - "loss": 0.0625, - "step": 2017 - }, - { - "epoch": 36.69090909090909, - "grad_norm": 0.9158309102058411, - "learning_rate": 2.6654545454545453e-06, - "loss": 0.0539, - "step": 2018 - }, - { - "epoch": 36.70909090909091, - "grad_norm": 0.8432338833808899, - "learning_rate": 2.661818181818182e-06, - "loss": 0.0574, - "step": 2019 - }, - { - "epoch": 36.72727272727273, - "grad_norm": 0.7257166504859924, - "learning_rate": 2.6581818181818187e-06, - "loss": 0.0456, - "step": 2020 - }, - { - "epoch": 36.74545454545454, - "grad_norm": 0.4642726480960846, - "learning_rate": 2.6545454545454546e-06, - "loss": 0.0485, - "step": 2021 - }, - { - "epoch": 36.763636363636365, - "grad_norm": 2.6390397548675537, - "learning_rate": 2.6509090909090913e-06, - "loss": 0.0695, - "step": 2022 - }, - { - "epoch": 36.78181818181818, - "grad_norm": 0.8048014640808105, - "learning_rate": 2.647272727272727e-06, - "loss": 0.0521, - "step": 2023 - }, - { - "epoch": 36.8, - "grad_norm": 0.8623517751693726, - "learning_rate": 2.643636363636364e-06, - "loss": 0.0603, - "step": 2024 - }, - { - "epoch": 36.81818181818182, - "grad_norm": 1.4917877912521362, - "learning_rate": 2.64e-06, - "loss": 0.0825, - "step": 2025 - }, - { - "epoch": 36.836363636363636, - "grad_norm": 1.926193118095398, - "learning_rate": 2.6363636363636364e-06, - "loss": 0.0708, - "step": 2026 - }, - { - "epoch": 36.85454545454545, - "grad_norm": 1.7792209386825562, - "learning_rate": 2.6327272727272727e-06, - "loss": 0.0475, - "step": 2027 - }, - { - "epoch": 36.872727272727275, - "grad_norm": 0.6991704702377319, - "learning_rate": 2.6290909090909094e-06, - "loss": 0.0487, - "step": 2028 - }, - { - "epoch": 36.89090909090909, - "grad_norm": 1.669973611831665, - "learning_rate": 2.6254545454545453e-06, - "loss": 0.0565, - "step": 2029 - }, - { - "epoch": 36.90909090909091, - "grad_norm": 1.0429937839508057, - "learning_rate": 2.621818181818182e-06, - "loss": 0.0659, - "step": 2030 - }, - { - "epoch": 36.92727272727273, - "grad_norm": 0.39590755105018616, - "learning_rate": 2.6181818181818187e-06, - "loss": 0.0532, - "step": 2031 - }, - { - "epoch": 36.945454545454545, - "grad_norm": 0.8879863619804382, - "learning_rate": 2.6145454545454545e-06, - "loss": 0.0563, - "step": 2032 - }, - { - "epoch": 36.96363636363636, - "grad_norm": 1.4540802240371704, - "learning_rate": 2.6109090909090912e-06, - "loss": 0.0472, - "step": 2033 - }, - { - "epoch": 36.981818181818184, - "grad_norm": 0.5362735390663147, - "learning_rate": 2.6072727272727275e-06, - "loss": 0.0606, - "step": 2034 - }, - { - "epoch": 37.0, - "grad_norm": 0.6486362814903259, - "learning_rate": 2.603636363636364e-06, - "loss": 0.0714, - "step": 2035 - }, - { - "epoch": 37.0, - "eval_loss": 0.05601733922958374, - "eval_runtime": 9.2175, - "eval_samples_per_second": 590.726, - "eval_steps_per_second": 73.881, - "step": 2035 - }, - { - "epoch": 37.018181818181816, - "grad_norm": 1.33858060836792, - "learning_rate": 2.6e-06, - "loss": 0.0544, - "step": 2036 - }, - { - "epoch": 37.03636363636364, - "grad_norm": 0.1936122328042984, - "learning_rate": 2.5963636363636364e-06, - "loss": 0.0437, - "step": 2037 - }, - { - "epoch": 37.054545454545455, - "grad_norm": 1.8591914176940918, - "learning_rate": 2.5927272727272727e-06, - "loss": 0.0692, - "step": 2038 - }, - { - "epoch": 37.07272727272727, - "grad_norm": 0.5379475355148315, - "learning_rate": 2.5890909090909094e-06, - "loss": 0.0787, - "step": 2039 - }, - { - "epoch": 37.09090909090909, - "grad_norm": 0.26854798197746277, - "learning_rate": 2.5854545454545456e-06, - "loss": 0.0513, - "step": 2040 - }, - { - "epoch": 37.10909090909091, - "grad_norm": 1.883224368095398, - "learning_rate": 2.581818181818182e-06, - "loss": 0.0614, - "step": 2041 - }, - { - "epoch": 37.127272727272725, - "grad_norm": 0.8590031862258911, - "learning_rate": 2.5781818181818186e-06, - "loss": 0.0584, - "step": 2042 - }, - { - "epoch": 37.14545454545455, - "grad_norm": 0.3823501467704773, - "learning_rate": 2.5745454545454545e-06, - "loss": 0.0532, - "step": 2043 - }, - { - "epoch": 37.163636363636364, - "grad_norm": 0.23391024768352509, - "learning_rate": 2.570909090909091e-06, - "loss": 0.0605, - "step": 2044 - }, - { - "epoch": 37.18181818181818, - "grad_norm": 2.3636350631713867, - "learning_rate": 2.5672727272727275e-06, - "loss": 0.0706, - "step": 2045 - }, - { - "epoch": 37.2, - "grad_norm": 0.4846961200237274, - "learning_rate": 2.5636363636363638e-06, - "loss": 0.0374, - "step": 2046 - }, - { - "epoch": 37.21818181818182, - "grad_norm": 0.24729807674884796, - "learning_rate": 2.56e-06, - "loss": 0.0426, - "step": 2047 - }, - { - "epoch": 37.236363636363635, - "grad_norm": 0.7498653531074524, - "learning_rate": 2.5563636363636368e-06, - "loss": 0.0711, - "step": 2048 - }, - { - "epoch": 37.25454545454546, - "grad_norm": 0.6162502765655518, - "learning_rate": 2.552727272727273e-06, - "loss": 0.0714, - "step": 2049 - }, - { - "epoch": 37.27272727272727, - "grad_norm": 1.0182005167007446, - "learning_rate": 2.5490909090909093e-06, - "loss": 0.0562, - "step": 2050 - }, - { - "epoch": 37.29090909090909, - "grad_norm": 0.8939874768257141, - "learning_rate": 2.5454545454545456e-06, - "loss": 0.0645, - "step": 2051 - }, - { - "epoch": 37.30909090909091, - "grad_norm": 0.48110899329185486, - "learning_rate": 2.541818181818182e-06, - "loss": 0.0535, - "step": 2052 - }, - { - "epoch": 37.32727272727273, - "grad_norm": 1.0042262077331543, - "learning_rate": 2.5381818181818186e-06, - "loss": 0.0592, - "step": 2053 - }, - { - "epoch": 37.345454545454544, - "grad_norm": 0.8531234264373779, - "learning_rate": 2.5345454545454545e-06, - "loss": 0.0532, - "step": 2054 - }, - { - "epoch": 37.36363636363637, - "grad_norm": 2.6332900524139404, - "learning_rate": 2.530909090909091e-06, - "loss": 0.0611, - "step": 2055 - }, - { - "epoch": 37.38181818181818, - "grad_norm": 3.1963937282562256, - "learning_rate": 2.5272727272727274e-06, - "loss": 0.0601, - "step": 2056 - }, - { - "epoch": 37.4, - "grad_norm": 1.8138035535812378, - "learning_rate": 2.5236363636363637e-06, - "loss": 0.0705, - "step": 2057 - }, - { - "epoch": 37.41818181818182, - "grad_norm": 0.46668437123298645, - "learning_rate": 2.52e-06, - "loss": 0.0647, - "step": 2058 - }, - { - "epoch": 37.43636363636364, - "grad_norm": 1.0093685388565063, - "learning_rate": 2.5163636363636367e-06, - "loss": 0.0451, - "step": 2059 - }, - { - "epoch": 37.45454545454545, - "grad_norm": 2.0076863765716553, - "learning_rate": 2.512727272727273e-06, - "loss": 0.0517, - "step": 2060 - }, - { - "epoch": 37.472727272727276, - "grad_norm": 2.0239219665527344, - "learning_rate": 2.5090909090909093e-06, - "loss": 0.0537, - "step": 2061 - }, - { - "epoch": 37.49090909090909, - "grad_norm": 0.7722144722938538, - "learning_rate": 2.505454545454546e-06, - "loss": 0.0462, - "step": 2062 - }, - { - "epoch": 37.50909090909091, - "grad_norm": 0.18569202721118927, - "learning_rate": 2.501818181818182e-06, - "loss": 0.0585, - "step": 2063 - }, - { - "epoch": 37.527272727272724, - "grad_norm": 1.4299477338790894, - "learning_rate": 2.4981818181818186e-06, - "loss": 0.0677, - "step": 2064 - }, - { - "epoch": 37.54545454545455, - "grad_norm": 1.2111817598342896, - "learning_rate": 2.494545454545455e-06, - "loss": 0.0465, - "step": 2065 - }, - { - "epoch": 37.56363636363636, - "grad_norm": 1.0859261751174927, - "learning_rate": 2.490909090909091e-06, - "loss": 0.0505, - "step": 2066 - }, - { - "epoch": 37.58181818181818, - "grad_norm": 0.42662566900253296, - "learning_rate": 2.4872727272727274e-06, - "loss": 0.0628, - "step": 2067 - }, - { - "epoch": 37.6, - "grad_norm": 1.5650113821029663, - "learning_rate": 2.4836363636363637e-06, - "loss": 0.0577, - "step": 2068 - }, - { - "epoch": 37.61818181818182, - "grad_norm": 0.5153284072875977, - "learning_rate": 2.4800000000000004e-06, - "loss": 0.0839, - "step": 2069 - }, - { - "epoch": 37.63636363636363, - "grad_norm": 2.692976474761963, - "learning_rate": 2.4763636363636367e-06, - "loss": 0.0519, - "step": 2070 - }, - { - "epoch": 37.654545454545456, - "grad_norm": 0.8469442129135132, - "learning_rate": 2.472727272727273e-06, - "loss": 0.0648, - "step": 2071 - }, - { - "epoch": 37.67272727272727, - "grad_norm": 0.22569584846496582, - "learning_rate": 2.4690909090909092e-06, - "loss": 0.0663, - "step": 2072 - }, - { - "epoch": 37.69090909090909, - "grad_norm": 0.5174769163131714, - "learning_rate": 2.4654545454545455e-06, - "loss": 0.0434, - "step": 2073 - }, - { - "epoch": 37.70909090909091, - "grad_norm": 0.7292089462280273, - "learning_rate": 2.461818181818182e-06, - "loss": 0.0473, - "step": 2074 - }, - { - "epoch": 37.72727272727273, - "grad_norm": 1.2575974464416504, - "learning_rate": 2.4581818181818185e-06, - "loss": 0.0632, - "step": 2075 - }, - { - "epoch": 37.74545454545454, - "grad_norm": 1.4400620460510254, - "learning_rate": 2.454545454545455e-06, - "loss": 0.0754, - "step": 2076 - }, - { - "epoch": 37.763636363636365, - "grad_norm": 0.21405689418315887, - "learning_rate": 2.450909090909091e-06, - "loss": 0.0524, - "step": 2077 - }, - { - "epoch": 37.78181818181818, - "grad_norm": 1.5354593992233276, - "learning_rate": 2.4472727272727274e-06, - "loss": 0.0461, - "step": 2078 - }, - { - "epoch": 37.8, - "grad_norm": 0.9241748452186584, - "learning_rate": 2.443636363636364e-06, - "loss": 0.0598, - "step": 2079 - }, - { - "epoch": 37.81818181818182, - "grad_norm": 1.7316398620605469, - "learning_rate": 2.4400000000000004e-06, - "loss": 0.0569, - "step": 2080 - }, - { - "epoch": 37.836363636363636, - "grad_norm": 0.29837459325790405, - "learning_rate": 2.4363636363636366e-06, - "loss": 0.061, - "step": 2081 - }, - { - "epoch": 37.85454545454545, - "grad_norm": 0.9876394867897034, - "learning_rate": 2.432727272727273e-06, - "loss": 0.0568, - "step": 2082 - }, - { - "epoch": 37.872727272727275, - "grad_norm": 0.45546597242355347, - "learning_rate": 2.429090909090909e-06, - "loss": 0.0462, - "step": 2083 - }, - { - "epoch": 37.89090909090909, - "grad_norm": 2.0097434520721436, - "learning_rate": 2.4254545454545455e-06, - "loss": 0.0709, - "step": 2084 - }, - { - "epoch": 37.90909090909091, - "grad_norm": 0.23392052948474884, - "learning_rate": 2.421818181818182e-06, - "loss": 0.0529, - "step": 2085 - }, - { - "epoch": 37.92727272727273, - "grad_norm": 0.970085859298706, - "learning_rate": 2.4181818181818185e-06, - "loss": 0.0571, - "step": 2086 - }, - { - "epoch": 37.945454545454545, - "grad_norm": 1.624928593635559, - "learning_rate": 2.4145454545454548e-06, - "loss": 0.0558, - "step": 2087 - }, - { - "epoch": 37.96363636363636, - "grad_norm": 1.6436820030212402, - "learning_rate": 2.410909090909091e-06, - "loss": 0.0522, - "step": 2088 - }, - { - "epoch": 37.981818181818184, - "grad_norm": 1.485972285270691, - "learning_rate": 2.4072727272727277e-06, - "loss": 0.086, - "step": 2089 - }, - { - "epoch": 38.0, - "grad_norm": 0.7091927528381348, - "learning_rate": 2.403636363636364e-06, - "loss": 0.0806, - "step": 2090 - }, - { - "epoch": 38.0, - "eval_loss": 0.05588439479470253, - "eval_runtime": 8.3723, - "eval_samples_per_second": 650.356, - "eval_steps_per_second": 81.339, - "step": 2090 - }, - { - "epoch": 38.018181818181816, - "grad_norm": 0.43934208154678345, - "learning_rate": 2.4000000000000003e-06, - "loss": 0.0687, - "step": 2091 - }, - { - "epoch": 38.03636363636364, - "grad_norm": 0.7572969198226929, - "learning_rate": 2.3963636363636366e-06, - "loss": 0.0504, - "step": 2092 - }, - { - "epoch": 38.054545454545455, - "grad_norm": 1.0364779233932495, - "learning_rate": 2.392727272727273e-06, - "loss": 0.0706, - "step": 2093 - }, - { - "epoch": 38.07272727272727, - "grad_norm": 0.7277921438217163, - "learning_rate": 2.389090909090909e-06, - "loss": 0.0572, - "step": 2094 - }, - { - "epoch": 38.09090909090909, - "grad_norm": 1.7247065305709839, - "learning_rate": 2.3854545454545454e-06, - "loss": 0.0432, - "step": 2095 - }, - { - "epoch": 38.10909090909091, - "grad_norm": 1.6208046674728394, - "learning_rate": 2.381818181818182e-06, - "loss": 0.0617, - "step": 2096 - }, - { - "epoch": 38.127272727272725, - "grad_norm": 0.9663704633712769, - "learning_rate": 2.3781818181818184e-06, - "loss": 0.0632, - "step": 2097 - }, - { - "epoch": 38.14545454545455, - "grad_norm": 1.391657829284668, - "learning_rate": 2.3745454545454547e-06, - "loss": 0.0663, - "step": 2098 - }, - { - "epoch": 38.163636363636364, - "grad_norm": 0.7732747793197632, - "learning_rate": 2.3709090909090914e-06, - "loss": 0.0507, - "step": 2099 - }, - { - "epoch": 38.18181818181818, - "grad_norm": 1.0083469152450562, - "learning_rate": 2.3672727272727277e-06, - "loss": 0.056, - "step": 2100 - }, - { - "epoch": 38.2, - "grad_norm": 0.9722573161125183, - "learning_rate": 2.363636363636364e-06, - "loss": 0.0496, - "step": 2101 - }, - { - "epoch": 38.21818181818182, - "grad_norm": 0.2705049514770508, - "learning_rate": 2.3600000000000003e-06, - "loss": 0.0737, - "step": 2102 - }, - { - "epoch": 38.236363636363635, - "grad_norm": 0.31200969219207764, - "learning_rate": 2.3563636363636366e-06, - "loss": 0.0666, - "step": 2103 - }, - { - "epoch": 38.25454545454546, - "grad_norm": 0.39989346265792847, - "learning_rate": 2.352727272727273e-06, - "loss": 0.0572, - "step": 2104 - }, - { - "epoch": 38.27272727272727, - "grad_norm": 0.7953023910522461, - "learning_rate": 2.349090909090909e-06, - "loss": 0.0717, - "step": 2105 - }, - { - "epoch": 38.29090909090909, - "grad_norm": 1.3797898292541504, - "learning_rate": 2.345454545454546e-06, - "loss": 0.0551, - "step": 2106 - }, - { - "epoch": 38.30909090909091, - "grad_norm": 2.0723578929901123, - "learning_rate": 2.341818181818182e-06, - "loss": 0.0603, - "step": 2107 - }, - { - "epoch": 38.32727272727273, - "grad_norm": 1.8820576667785645, - "learning_rate": 2.3381818181818184e-06, - "loss": 0.0597, - "step": 2108 - }, - { - "epoch": 38.345454545454544, - "grad_norm": 0.27353787422180176, - "learning_rate": 2.3345454545454547e-06, - "loss": 0.0606, - "step": 2109 - }, - { - "epoch": 38.36363636363637, - "grad_norm": 0.5669616460800171, - "learning_rate": 2.3309090909090914e-06, - "loss": 0.0505, - "step": 2110 - }, - { - "epoch": 38.38181818181818, - "grad_norm": 2.8208119869232178, - "learning_rate": 2.3272727272727277e-06, - "loss": 0.0797, - "step": 2111 - }, - { - "epoch": 38.4, - "grad_norm": 0.4144856333732605, - "learning_rate": 2.323636363636364e-06, - "loss": 0.0559, - "step": 2112 - }, - { - "epoch": 38.41818181818182, - "grad_norm": 1.2785667181015015, - "learning_rate": 2.3200000000000002e-06, - "loss": 0.0663, - "step": 2113 - }, - { - "epoch": 38.43636363636364, - "grad_norm": 0.3829456865787506, - "learning_rate": 2.3163636363636365e-06, - "loss": 0.039, - "step": 2114 - }, - { - "epoch": 38.45454545454545, - "grad_norm": 0.5801783204078674, - "learning_rate": 2.312727272727273e-06, - "loss": 0.0668, - "step": 2115 - }, - { - "epoch": 38.472727272727276, - "grad_norm": 0.22602400183677673, - "learning_rate": 2.309090909090909e-06, - "loss": 0.0596, - "step": 2116 - }, - { - "epoch": 38.49090909090909, - "grad_norm": 2.340325355529785, - "learning_rate": 2.305454545454546e-06, - "loss": 0.0498, - "step": 2117 - }, - { - "epoch": 38.50909090909091, - "grad_norm": 2.180292844772339, - "learning_rate": 2.301818181818182e-06, - "loss": 0.0651, - "step": 2118 - }, - { - "epoch": 38.527272727272724, - "grad_norm": 1.6077604293823242, - "learning_rate": 2.2981818181818184e-06, - "loss": 0.0376, - "step": 2119 - }, - { - "epoch": 38.54545454545455, - "grad_norm": 1.5401383638381958, - "learning_rate": 2.294545454545455e-06, - "loss": 0.06, - "step": 2120 - }, - { - "epoch": 38.56363636363636, - "grad_norm": 1.1375362873077393, - "learning_rate": 2.2909090909090913e-06, - "loss": 0.0659, - "step": 2121 - }, - { - "epoch": 38.58181818181818, - "grad_norm": 0.3750540316104889, - "learning_rate": 2.2872727272727276e-06, - "loss": 0.062, - "step": 2122 - }, - { - "epoch": 38.6, - "grad_norm": 0.31117820739746094, - "learning_rate": 2.283636363636364e-06, - "loss": 0.0599, - "step": 2123 - }, - { - "epoch": 38.61818181818182, - "grad_norm": 1.285040020942688, - "learning_rate": 2.28e-06, - "loss": 0.0581, - "step": 2124 - }, - { - "epoch": 38.63636363636363, - "grad_norm": 0.3734014928340912, - "learning_rate": 2.2763636363636365e-06, - "loss": 0.0667, - "step": 2125 - }, - { - "epoch": 38.654545454545456, - "grad_norm": 2.9768121242523193, - "learning_rate": 2.2727272727272728e-06, - "loss": 0.0586, - "step": 2126 - }, - { - "epoch": 38.67272727272727, - "grad_norm": 2.274249792098999, - "learning_rate": 2.2690909090909095e-06, - "loss": 0.0617, - "step": 2127 - }, - { - "epoch": 38.69090909090909, - "grad_norm": 0.5971135497093201, - "learning_rate": 2.2654545454545457e-06, - "loss": 0.0604, - "step": 2128 - }, - { - "epoch": 38.70909090909091, - "grad_norm": 0.379792183637619, - "learning_rate": 2.261818181818182e-06, - "loss": 0.0418, - "step": 2129 - }, - { - "epoch": 38.72727272727273, - "grad_norm": 1.7197226285934448, - "learning_rate": 2.2581818181818183e-06, - "loss": 0.0528, - "step": 2130 - }, - { - "epoch": 38.74545454545454, - "grad_norm": 0.8475543260574341, - "learning_rate": 2.254545454545455e-06, - "loss": 0.062, - "step": 2131 - }, - { - "epoch": 38.763636363636365, - "grad_norm": 0.5192212462425232, - "learning_rate": 2.2509090909090913e-06, - "loss": 0.0663, - "step": 2132 - }, - { - "epoch": 38.78181818181818, - "grad_norm": 0.5192553997039795, - "learning_rate": 2.2472727272727276e-06, - "loss": 0.0645, - "step": 2133 - }, - { - "epoch": 38.8, - "grad_norm": 0.24954499304294586, - "learning_rate": 2.243636363636364e-06, - "loss": 0.0557, - "step": 2134 - }, - { - "epoch": 38.81818181818182, - "grad_norm": 1.3801172971725464, - "learning_rate": 2.24e-06, - "loss": 0.0487, - "step": 2135 - }, - { - "epoch": 38.836363636363636, - "grad_norm": 0.30934515595436096, - "learning_rate": 2.2363636363636364e-06, - "loss": 0.0558, - "step": 2136 - }, - { - "epoch": 38.85454545454545, - "grad_norm": 0.6331178545951843, - "learning_rate": 2.2327272727272727e-06, - "loss": 0.0492, - "step": 2137 - }, - { - "epoch": 38.872727272727275, - "grad_norm": 0.4495188593864441, - "learning_rate": 2.2290909090909094e-06, - "loss": 0.0531, - "step": 2138 - }, - { - "epoch": 38.89090909090909, - "grad_norm": 1.202928066253662, - "learning_rate": 2.2254545454545457e-06, - "loss": 0.0606, - "step": 2139 - }, - { - "epoch": 38.90909090909091, - "grad_norm": 0.5935454964637756, - "learning_rate": 2.221818181818182e-06, - "loss": 0.0607, - "step": 2140 - }, - { - "epoch": 38.92727272727273, - "grad_norm": 1.594401240348816, - "learning_rate": 2.2181818181818187e-06, - "loss": 0.0571, - "step": 2141 - }, - { - "epoch": 38.945454545454545, - "grad_norm": 0.965109646320343, - "learning_rate": 2.214545454545455e-06, - "loss": 0.0829, - "step": 2142 - }, - { - "epoch": 38.96363636363636, - "grad_norm": 0.6481137275695801, - "learning_rate": 2.2109090909090913e-06, - "loss": 0.0629, - "step": 2143 - }, - { - "epoch": 38.981818181818184, - "grad_norm": 0.56639564037323, - "learning_rate": 2.2072727272727275e-06, - "loss": 0.0484, - "step": 2144 - }, - { - "epoch": 39.0, - "grad_norm": 1.3727740049362183, - "learning_rate": 2.203636363636364e-06, - "loss": 0.0643, - "step": 2145 - }, - { - "epoch": 39.0, - "eval_loss": 0.05606410279870033, - "eval_runtime": 9.1338, - "eval_samples_per_second": 596.14, - "eval_steps_per_second": 74.559, - "step": 2145 - }, - { - "epoch": 39.018181818181816, - "grad_norm": 0.9323717355728149, - "learning_rate": 2.2e-06, - "loss": 0.064, - "step": 2146 - }, - { - "epoch": 39.03636363636364, - "grad_norm": 0.39446333050727844, - "learning_rate": 2.1963636363636364e-06, - "loss": 0.0694, - "step": 2147 - }, - { - "epoch": 39.054545454545455, - "grad_norm": 1.5236642360687256, - "learning_rate": 2.192727272727273e-06, - "loss": 0.0673, - "step": 2148 - }, - { - "epoch": 39.07272727272727, - "grad_norm": 3.004516363143921, - "learning_rate": 2.1890909090909094e-06, - "loss": 0.0513, - "step": 2149 - }, - { - "epoch": 39.09090909090909, - "grad_norm": 2.2257559299468994, - "learning_rate": 2.1854545454545457e-06, - "loss": 0.0603, - "step": 2150 - }, - { - "epoch": 39.10909090909091, - "grad_norm": 1.0008474588394165, - "learning_rate": 2.181818181818182e-06, - "loss": 0.0549, - "step": 2151 - }, - { - "epoch": 39.127272727272725, - "grad_norm": 2.3854215145111084, - "learning_rate": 2.1781818181818187e-06, - "loss": 0.0784, - "step": 2152 - }, - { - "epoch": 39.14545454545455, - "grad_norm": 1.5416327714920044, - "learning_rate": 2.174545454545455e-06, - "loss": 0.0607, - "step": 2153 - }, - { - "epoch": 39.163636363636364, - "grad_norm": 1.9058406352996826, - "learning_rate": 2.1709090909090912e-06, - "loss": 0.0699, - "step": 2154 - }, - { - "epoch": 39.18181818181818, - "grad_norm": 0.9141010046005249, - "learning_rate": 2.1672727272727275e-06, - "loss": 0.059, - "step": 2155 - }, - { - "epoch": 39.2, - "grad_norm": 0.3028789162635803, - "learning_rate": 2.163636363636364e-06, - "loss": 0.0598, - "step": 2156 - }, - { - "epoch": 39.21818181818182, - "grad_norm": 1.2137407064437866, - "learning_rate": 2.16e-06, - "loss": 0.0542, - "step": 2157 - }, - { - "epoch": 39.236363636363635, - "grad_norm": 0.9479543566703796, - "learning_rate": 2.1563636363636364e-06, - "loss": 0.0573, - "step": 2158 - }, - { - "epoch": 39.25454545454546, - "grad_norm": 1.3431289196014404, - "learning_rate": 2.152727272727273e-06, - "loss": 0.0532, - "step": 2159 - }, - { - "epoch": 39.27272727272727, - "grad_norm": 0.7350088357925415, - "learning_rate": 2.1490909090909093e-06, - "loss": 0.0635, - "step": 2160 - }, - { - "epoch": 39.29090909090909, - "grad_norm": 0.5319529175758362, - "learning_rate": 2.1454545454545456e-06, - "loss": 0.0522, - "step": 2161 - }, - { - "epoch": 39.30909090909091, - "grad_norm": 0.46826472878456116, - "learning_rate": 2.1418181818181823e-06, - "loss": 0.065, - "step": 2162 - }, - { - "epoch": 39.32727272727273, - "grad_norm": 0.8113547563552856, - "learning_rate": 2.1381818181818186e-06, - "loss": 0.0449, - "step": 2163 - }, - { - "epoch": 39.345454545454544, - "grad_norm": 0.5775340795516968, - "learning_rate": 2.134545454545455e-06, - "loss": 0.0415, - "step": 2164 - }, - { - "epoch": 39.36363636363637, - "grad_norm": 1.3355979919433594, - "learning_rate": 2.130909090909091e-06, - "loss": 0.0708, - "step": 2165 - }, - { - "epoch": 39.38181818181818, - "grad_norm": 0.5057485699653625, - "learning_rate": 2.1272727272727275e-06, - "loss": 0.0611, - "step": 2166 - }, - { - "epoch": 39.4, - "grad_norm": 1.8089934587478638, - "learning_rate": 2.1236363636363637e-06, - "loss": 0.0668, - "step": 2167 - }, - { - "epoch": 39.41818181818182, - "grad_norm": 1.8733937740325928, - "learning_rate": 2.12e-06, - "loss": 0.0502, - "step": 2168 - }, - { - "epoch": 39.43636363636364, - "grad_norm": 0.7304433584213257, - "learning_rate": 2.1163636363636367e-06, - "loss": 0.0727, - "step": 2169 - }, - { - "epoch": 39.45454545454545, - "grad_norm": 1.2420685291290283, - "learning_rate": 2.112727272727273e-06, - "loss": 0.0674, - "step": 2170 - }, - { - "epoch": 39.472727272727276, - "grad_norm": 0.42103111743927, - "learning_rate": 2.1090909090909093e-06, - "loss": 0.0617, - "step": 2171 - }, - { - "epoch": 39.49090909090909, - "grad_norm": 1.4657869338989258, - "learning_rate": 2.1054545454545456e-06, - "loss": 0.0511, - "step": 2172 - }, - { - "epoch": 39.50909090909091, - "grad_norm": 2.7023823261260986, - "learning_rate": 2.1018181818181823e-06, - "loss": 0.0724, - "step": 2173 - }, - { - "epoch": 39.527272727272724, - "grad_norm": 1.486281156539917, - "learning_rate": 2.0981818181818186e-06, - "loss": 0.0468, - "step": 2174 - }, - { - "epoch": 39.54545454545455, - "grad_norm": 0.23237887024879456, - "learning_rate": 2.094545454545455e-06, - "loss": 0.0572, - "step": 2175 - }, - { - "epoch": 39.56363636363636, - "grad_norm": 0.2263251692056656, - "learning_rate": 2.090909090909091e-06, - "loss": 0.0542, - "step": 2176 - }, - { - "epoch": 39.58181818181818, - "grad_norm": 0.22883813083171844, - "learning_rate": 2.0872727272727274e-06, - "loss": 0.0485, - "step": 2177 - }, - { - "epoch": 39.6, - "grad_norm": 0.8684478998184204, - "learning_rate": 2.0836363636363637e-06, - "loss": 0.0564, - "step": 2178 - }, - { - "epoch": 39.61818181818182, - "grad_norm": 0.3134025037288666, - "learning_rate": 2.08e-06, - "loss": 0.0771, - "step": 2179 - }, - { - "epoch": 39.63636363636363, - "grad_norm": 0.4449402689933777, - "learning_rate": 2.0763636363636367e-06, - "loss": 0.0586, - "step": 2180 - }, - { - "epoch": 39.654545454545456, - "grad_norm": 0.5784072875976562, - "learning_rate": 2.072727272727273e-06, - "loss": 0.0487, - "step": 2181 - }, - { - "epoch": 39.67272727272727, - "grad_norm": 1.9227023124694824, - "learning_rate": 2.0690909090909093e-06, - "loss": 0.0569, - "step": 2182 - }, - { - "epoch": 39.69090909090909, - "grad_norm": 0.7631653547286987, - "learning_rate": 2.0654545454545455e-06, - "loss": 0.0542, - "step": 2183 - }, - { - "epoch": 39.70909090909091, - "grad_norm": 0.8173301815986633, - "learning_rate": 2.0618181818181823e-06, - "loss": 0.0623, - "step": 2184 - }, - { - "epoch": 39.72727272727273, - "grad_norm": 0.7579083442687988, - "learning_rate": 2.0581818181818185e-06, - "loss": 0.0532, - "step": 2185 - }, - { - "epoch": 39.74545454545454, - "grad_norm": 0.24974891543388367, - "learning_rate": 2.054545454545455e-06, - "loss": 0.0518, - "step": 2186 - }, - { - "epoch": 39.763636363636365, - "grad_norm": 0.16365793347358704, - "learning_rate": 2.050909090909091e-06, - "loss": 0.0646, - "step": 2187 - }, - { - "epoch": 39.78181818181818, - "grad_norm": 1.2832542657852173, - "learning_rate": 2.0472727272727274e-06, - "loss": 0.0647, - "step": 2188 - }, - { - "epoch": 39.8, - "grad_norm": 1.7822118997573853, - "learning_rate": 2.0436363636363637e-06, - "loss": 0.0488, - "step": 2189 - }, - { - "epoch": 39.81818181818182, - "grad_norm": 0.35288572311401367, - "learning_rate": 2.04e-06, - "loss": 0.0619, - "step": 2190 - }, - { - "epoch": 39.836363636363636, - "grad_norm": 0.3154415488243103, - "learning_rate": 2.0363636363636367e-06, - "loss": 0.0537, - "step": 2191 - }, - { - "epoch": 39.85454545454545, - "grad_norm": 1.7446848154067993, - "learning_rate": 2.032727272727273e-06, - "loss": 0.0595, - "step": 2192 - }, - { - "epoch": 39.872727272727275, - "grad_norm": 1.1527953147888184, - "learning_rate": 2.0290909090909092e-06, - "loss": 0.0586, - "step": 2193 - }, - { - "epoch": 39.89090909090909, - "grad_norm": 1.869132161140442, - "learning_rate": 2.025454545454546e-06, - "loss": 0.0622, - "step": 2194 - }, - { - "epoch": 39.90909090909091, - "grad_norm": 0.8881543278694153, - "learning_rate": 2.0218181818181822e-06, - "loss": 0.0513, - "step": 2195 - }, - { - "epoch": 39.92727272727273, - "grad_norm": 2.5128767490386963, - "learning_rate": 2.0181818181818185e-06, - "loss": 0.0665, - "step": 2196 - }, - { - "epoch": 39.945454545454545, - "grad_norm": 1.7180774211883545, - "learning_rate": 2.0145454545454548e-06, - "loss": 0.0589, - "step": 2197 - }, - { - "epoch": 39.96363636363636, - "grad_norm": 1.2597408294677734, - "learning_rate": 2.010909090909091e-06, - "loss": 0.0545, - "step": 2198 - }, - { - "epoch": 39.981818181818184, - "grad_norm": 0.2117815464735031, - "learning_rate": 2.0072727272727273e-06, - "loss": 0.0589, - "step": 2199 - }, - { - "epoch": 40.0, - "grad_norm": 2.30427622795105, - "learning_rate": 2.0036363636363636e-06, - "loss": 0.0598, - "step": 2200 - }, - { - "epoch": 40.0, - "eval_loss": 0.05686158314347267, - "eval_runtime": 9.1764, - "eval_samples_per_second": 593.371, - "eval_steps_per_second": 74.212, - "step": 2200 - }, - { - "epoch": 40.018181818181816, - "grad_norm": 1.338448405265808, - "learning_rate": 2.0000000000000003e-06, - "loss": 0.0497, - "step": 2201 - }, - { - "epoch": 40.03636363636364, - "grad_norm": 1.7391191720962524, - "learning_rate": 1.9963636363636366e-06, - "loss": 0.0659, - "step": 2202 - }, - { - "epoch": 40.054545454545455, - "grad_norm": 0.32793015241622925, - "learning_rate": 1.992727272727273e-06, - "loss": 0.0613, - "step": 2203 - }, - { - "epoch": 40.07272727272727, - "grad_norm": 0.3332318365573883, - "learning_rate": 1.989090909090909e-06, - "loss": 0.0555, - "step": 2204 - }, - { - "epoch": 40.09090909090909, - "grad_norm": 0.7095208168029785, - "learning_rate": 1.985454545454546e-06, - "loss": 0.0811, - "step": 2205 - }, - { - "epoch": 40.10909090909091, - "grad_norm": 2.633823871612549, - "learning_rate": 1.981818181818182e-06, - "loss": 0.0561, - "step": 2206 - }, - { - "epoch": 40.127272727272725, - "grad_norm": 0.4003574550151825, - "learning_rate": 1.9781818181818185e-06, - "loss": 0.0705, - "step": 2207 - }, - { - "epoch": 40.14545454545455, - "grad_norm": 0.21043196320533752, - "learning_rate": 1.9745454545454547e-06, - "loss": 0.0642, - "step": 2208 - }, - { - "epoch": 40.163636363636364, - "grad_norm": 0.5566392540931702, - "learning_rate": 1.970909090909091e-06, - "loss": 0.0444, - "step": 2209 - }, - { - "epoch": 40.18181818181818, - "grad_norm": 2.367607355117798, - "learning_rate": 1.9672727272727273e-06, - "loss": 0.0679, - "step": 2210 - }, - { - "epoch": 40.2, - "grad_norm": 0.6367204785346985, - "learning_rate": 1.9636363636363636e-06, - "loss": 0.0497, - "step": 2211 - }, - { - "epoch": 40.21818181818182, - "grad_norm": 0.4788474440574646, - "learning_rate": 1.9600000000000003e-06, - "loss": 0.065, - "step": 2212 - }, - { - "epoch": 40.236363636363635, - "grad_norm": 1.9213539361953735, - "learning_rate": 1.9563636363636366e-06, - "loss": 0.06, - "step": 2213 - }, - { - "epoch": 40.25454545454546, - "grad_norm": 1.8869107961654663, - "learning_rate": 1.952727272727273e-06, - "loss": 0.0571, - "step": 2214 - }, - { - "epoch": 40.27272727272727, - "grad_norm": 1.6950314044952393, - "learning_rate": 1.9490909090909096e-06, - "loss": 0.0536, - "step": 2215 - }, - { - "epoch": 40.29090909090909, - "grad_norm": 0.33977311849594116, - "learning_rate": 1.945454545454546e-06, - "loss": 0.0529, - "step": 2216 - }, - { - "epoch": 40.30909090909091, - "grad_norm": 0.8658501505851746, - "learning_rate": 1.941818181818182e-06, - "loss": 0.0426, - "step": 2217 - }, - { - "epoch": 40.32727272727273, - "grad_norm": 0.33184611797332764, - "learning_rate": 1.9381818181818184e-06, - "loss": 0.073, - "step": 2218 - }, - { - "epoch": 40.345454545454544, - "grad_norm": 1.3411803245544434, - "learning_rate": 1.9345454545454547e-06, - "loss": 0.0633, - "step": 2219 - }, - { - "epoch": 40.36363636363637, - "grad_norm": 0.3121744394302368, - "learning_rate": 1.930909090909091e-06, - "loss": 0.0614, - "step": 2220 - }, - { - "epoch": 40.38181818181818, - "grad_norm": 0.31166547536849976, - "learning_rate": 1.9272727272727273e-06, - "loss": 0.0555, - "step": 2221 - }, - { - "epoch": 40.4, - "grad_norm": 0.5633543133735657, - "learning_rate": 1.923636363636364e-06, - "loss": 0.0624, - "step": 2222 - }, - { - "epoch": 40.41818181818182, - "grad_norm": 0.6252700090408325, - "learning_rate": 1.9200000000000003e-06, - "loss": 0.0493, - "step": 2223 - }, - { - "epoch": 40.43636363636364, - "grad_norm": 0.3529735803604126, - "learning_rate": 1.9163636363636365e-06, - "loss": 0.0508, - "step": 2224 - }, - { - "epoch": 40.45454545454545, - "grad_norm": 0.41774749755859375, - "learning_rate": 1.912727272727273e-06, - "loss": 0.0594, - "step": 2225 - }, - { - "epoch": 40.472727272727276, - "grad_norm": 0.2487531304359436, - "learning_rate": 1.9090909090909095e-06, - "loss": 0.0559, - "step": 2226 - }, - { - "epoch": 40.49090909090909, - "grad_norm": 1.3716455698013306, - "learning_rate": 1.9054545454545456e-06, - "loss": 0.0504, - "step": 2227 - }, - { - "epoch": 40.50909090909091, - "grad_norm": 1.405882716178894, - "learning_rate": 1.9018181818181819e-06, - "loss": 0.0745, - "step": 2228 - }, - { - "epoch": 40.527272727272724, - "grad_norm": 0.629143238067627, - "learning_rate": 1.8981818181818184e-06, - "loss": 0.0567, - "step": 2229 - }, - { - "epoch": 40.54545454545455, - "grad_norm": 2.648716926574707, - "learning_rate": 1.8945454545454547e-06, - "loss": 0.0564, - "step": 2230 - }, - { - "epoch": 40.56363636363636, - "grad_norm": 1.545282006263733, - "learning_rate": 1.890909090909091e-06, - "loss": 0.0505, - "step": 2231 - }, - { - "epoch": 40.58181818181818, - "grad_norm": 0.44753435254096985, - "learning_rate": 1.8872727272727272e-06, - "loss": 0.0642, - "step": 2232 - }, - { - "epoch": 40.6, - "grad_norm": 0.45334392786026, - "learning_rate": 1.883636363636364e-06, - "loss": 0.0625, - "step": 2233 - }, - { - "epoch": 40.61818181818182, - "grad_norm": 2.0284602642059326, - "learning_rate": 1.8800000000000002e-06, - "loss": 0.0789, - "step": 2234 - }, - { - "epoch": 40.63636363636363, - "grad_norm": 0.465866357088089, - "learning_rate": 1.8763636363636365e-06, - "loss": 0.0591, - "step": 2235 - }, - { - "epoch": 40.654545454545456, - "grad_norm": 0.33178937435150146, - "learning_rate": 1.872727272727273e-06, - "loss": 0.0535, - "step": 2236 - }, - { - "epoch": 40.67272727272727, - "grad_norm": 0.9513147473335266, - "learning_rate": 1.8690909090909093e-06, - "loss": 0.053, - "step": 2237 - }, - { - "epoch": 40.69090909090909, - "grad_norm": 0.7124369740486145, - "learning_rate": 1.8654545454545456e-06, - "loss": 0.0424, - "step": 2238 - }, - { - "epoch": 40.70909090909091, - "grad_norm": 0.16575339436531067, - "learning_rate": 1.8618181818181818e-06, - "loss": 0.0537, - "step": 2239 - }, - { - "epoch": 40.72727272727273, - "grad_norm": 0.36140555143356323, - "learning_rate": 1.8581818181818183e-06, - "loss": 0.0574, - "step": 2240 - }, - { - "epoch": 40.74545454545454, - "grad_norm": 1.3248333930969238, - "learning_rate": 1.8545454545454546e-06, - "loss": 0.0451, - "step": 2241 - }, - { - "epoch": 40.763636363636365, - "grad_norm": 2.551966667175293, - "learning_rate": 1.850909090909091e-06, - "loss": 0.0762, - "step": 2242 - }, - { - "epoch": 40.78181818181818, - "grad_norm": 1.6012475490570068, - "learning_rate": 1.8472727272727276e-06, - "loss": 0.0495, - "step": 2243 - }, - { - "epoch": 40.8, - "grad_norm": 0.6489143371582031, - "learning_rate": 1.8436363636363639e-06, - "loss": 0.0569, - "step": 2244 - }, - { - "epoch": 40.81818181818182, - "grad_norm": 0.6184863448143005, - "learning_rate": 1.8400000000000002e-06, - "loss": 0.066, - "step": 2245 - }, - { - "epoch": 40.836363636363636, - "grad_norm": 0.4022826850414276, - "learning_rate": 1.8363636363636365e-06, - "loss": 0.0714, - "step": 2246 - }, - { - "epoch": 40.85454545454545, - "grad_norm": 1.8044648170471191, - "learning_rate": 1.832727272727273e-06, - "loss": 0.0497, - "step": 2247 - }, - { - "epoch": 40.872727272727275, - "grad_norm": 0.6852204203605652, - "learning_rate": 1.8290909090909092e-06, - "loss": 0.0346, - "step": 2248 - }, - { - "epoch": 40.89090909090909, - "grad_norm": 1.1548038721084595, - "learning_rate": 1.8254545454545455e-06, - "loss": 0.0513, - "step": 2249 - }, - { - "epoch": 40.90909090909091, - "grad_norm": 0.6014946103096008, - "learning_rate": 1.821818181818182e-06, - "loss": 0.0591, - "step": 2250 - }, - { - "epoch": 40.92727272727273, - "grad_norm": 0.767703115940094, - "learning_rate": 1.8181818181818183e-06, - "loss": 0.0665, - "step": 2251 - }, - { - "epoch": 40.945454545454545, - "grad_norm": 2.49381685256958, - "learning_rate": 1.8145454545454546e-06, - "loss": 0.0798, - "step": 2252 - }, - { - "epoch": 40.96363636363636, - "grad_norm": 0.30272457003593445, - "learning_rate": 1.8109090909090909e-06, - "loss": 0.0524, - "step": 2253 - }, - { - "epoch": 40.981818181818184, - "grad_norm": 2.2215495109558105, - "learning_rate": 1.8072727272727276e-06, - "loss": 0.0493, - "step": 2254 - }, - { - "epoch": 41.0, - "grad_norm": 0.6804737448692322, - "learning_rate": 1.8036363636363638e-06, - "loss": 0.0924, - "step": 2255 - }, - { - "epoch": 41.0, - "eval_loss": 0.05624762177467346, - "eval_runtime": 9.237, - "eval_samples_per_second": 589.48, - "eval_steps_per_second": 73.726, - "step": 2255 - }, - { - "epoch": 41.018181818181816, - "grad_norm": 1.0376977920532227, - "learning_rate": 1.8000000000000001e-06, - "loss": 0.0666, - "step": 2256 - }, - { - "epoch": 41.03636363636364, - "grad_norm": 1.569620132446289, - "learning_rate": 1.7963636363636366e-06, - "loss": 0.0609, - "step": 2257 - }, - { - "epoch": 41.054545454545455, - "grad_norm": 1.2261890172958374, - "learning_rate": 1.792727272727273e-06, - "loss": 0.0578, - "step": 2258 - }, - { - "epoch": 41.07272727272727, - "grad_norm": 1.6558276414871216, - "learning_rate": 1.7890909090909092e-06, - "loss": 0.0476, - "step": 2259 - }, - { - "epoch": 41.09090909090909, - "grad_norm": 0.5721738934516907, - "learning_rate": 1.7854545454545455e-06, - "loss": 0.0592, - "step": 2260 - }, - { - "epoch": 41.10909090909091, - "grad_norm": 0.6912810206413269, - "learning_rate": 1.781818181818182e-06, - "loss": 0.0532, - "step": 2261 - }, - { - "epoch": 41.127272727272725, - "grad_norm": 2.4888651371002197, - "learning_rate": 1.7781818181818183e-06, - "loss": 0.0621, - "step": 2262 - }, - { - "epoch": 41.14545454545455, - "grad_norm": 2.021860122680664, - "learning_rate": 1.7745454545454545e-06, - "loss": 0.0592, - "step": 2263 - }, - { - "epoch": 41.163636363636364, - "grad_norm": 0.7877523303031921, - "learning_rate": 1.7709090909090912e-06, - "loss": 0.0484, - "step": 2264 - }, - { - "epoch": 41.18181818181818, - "grad_norm": 0.2510339617729187, - "learning_rate": 1.7672727272727275e-06, - "loss": 0.0582, - "step": 2265 - }, - { - "epoch": 41.2, - "grad_norm": 1.1524829864501953, - "learning_rate": 1.7636363636363638e-06, - "loss": 0.072, - "step": 2266 - }, - { - "epoch": 41.21818181818182, - "grad_norm": 1.2420432567596436, - "learning_rate": 1.76e-06, - "loss": 0.053, - "step": 2267 - }, - { - "epoch": 41.236363636363635, - "grad_norm": 0.6356487274169922, - "learning_rate": 1.7563636363636366e-06, - "loss": 0.041, - "step": 2268 - }, - { - "epoch": 41.25454545454546, - "grad_norm": 0.2031138688325882, - "learning_rate": 1.7527272727272729e-06, - "loss": 0.0511, - "step": 2269 - }, - { - "epoch": 41.27272727272727, - "grad_norm": 0.6484189033508301, - "learning_rate": 1.7490909090909092e-06, - "loss": 0.0534, - "step": 2270 - }, - { - "epoch": 41.29090909090909, - "grad_norm": 1.060332179069519, - "learning_rate": 1.7454545454545456e-06, - "loss": 0.0535, - "step": 2271 - }, - { - "epoch": 41.30909090909091, - "grad_norm": 2.1795308589935303, - "learning_rate": 1.741818181818182e-06, - "loss": 0.0481, - "step": 2272 - }, - { - "epoch": 41.32727272727273, - "grad_norm": 2.000328540802002, - "learning_rate": 1.7381818181818182e-06, - "loss": 0.0673, - "step": 2273 - }, - { - "epoch": 41.345454545454544, - "grad_norm": 1.08549964427948, - "learning_rate": 1.7345454545454545e-06, - "loss": 0.0598, - "step": 2274 - }, - { - "epoch": 41.36363636363637, - "grad_norm": 0.5538372993469238, - "learning_rate": 1.7309090909090912e-06, - "loss": 0.0491, - "step": 2275 - }, - { - "epoch": 41.38181818181818, - "grad_norm": 0.2963055670261383, - "learning_rate": 1.7272727272727275e-06, - "loss": 0.0684, - "step": 2276 - }, - { - "epoch": 41.4, - "grad_norm": 0.8862787485122681, - "learning_rate": 1.7236363636363638e-06, - "loss": 0.0595, - "step": 2277 - }, - { - "epoch": 41.41818181818182, - "grad_norm": 1.4401406049728394, - "learning_rate": 1.72e-06, - "loss": 0.0586, - "step": 2278 - }, - { - "epoch": 41.43636363636364, - "grad_norm": 0.3585759401321411, - "learning_rate": 1.7163636363636365e-06, - "loss": 0.0693, - "step": 2279 - }, - { - "epoch": 41.45454545454545, - "grad_norm": 0.42267200350761414, - "learning_rate": 1.7127272727272728e-06, - "loss": 0.0706, - "step": 2280 - }, - { - "epoch": 41.472727272727276, - "grad_norm": 1.0285965204238892, - "learning_rate": 1.7090909090909091e-06, - "loss": 0.0548, - "step": 2281 - }, - { - "epoch": 41.49090909090909, - "grad_norm": 1.4405131340026855, - "learning_rate": 1.7054545454545456e-06, - "loss": 0.0594, - "step": 2282 - }, - { - "epoch": 41.50909090909091, - "grad_norm": 0.9643746018409729, - "learning_rate": 1.7018181818181819e-06, - "loss": 0.0524, - "step": 2283 - }, - { - "epoch": 41.527272727272724, - "grad_norm": 0.571611762046814, - "learning_rate": 1.6981818181818182e-06, - "loss": 0.0583, - "step": 2284 - }, - { - "epoch": 41.54545454545455, - "grad_norm": 2.397852897644043, - "learning_rate": 1.6945454545454545e-06, - "loss": 0.0477, - "step": 2285 - }, - { - "epoch": 41.56363636363636, - "grad_norm": 2.1167819499969482, - "learning_rate": 1.6909090909090912e-06, - "loss": 0.0617, - "step": 2286 - }, - { - "epoch": 41.58181818181818, - "grad_norm": 1.5353717803955078, - "learning_rate": 1.6872727272727274e-06, - "loss": 0.0522, - "step": 2287 - }, - { - "epoch": 41.6, - "grad_norm": 0.24240103363990784, - "learning_rate": 1.6836363636363637e-06, - "loss": 0.0609, - "step": 2288 - }, - { - "epoch": 41.61818181818182, - "grad_norm": 0.2295025885105133, - "learning_rate": 1.6800000000000002e-06, - "loss": 0.0508, - "step": 2289 - }, - { - "epoch": 41.63636363636363, - "grad_norm": 0.7964298129081726, - "learning_rate": 1.6763636363636365e-06, - "loss": 0.0591, - "step": 2290 - }, - { - "epoch": 41.654545454545456, - "grad_norm": 1.9023545980453491, - "learning_rate": 1.6727272727272728e-06, - "loss": 0.0896, - "step": 2291 - }, - { - "epoch": 41.67272727272727, - "grad_norm": 1.0932095050811768, - "learning_rate": 1.669090909090909e-06, - "loss": 0.0582, - "step": 2292 - }, - { - "epoch": 41.69090909090909, - "grad_norm": 0.8269695043563843, - "learning_rate": 1.6654545454545456e-06, - "loss": 0.0528, - "step": 2293 - }, - { - "epoch": 41.70909090909091, - "grad_norm": 0.219325989484787, - "learning_rate": 1.6618181818181818e-06, - "loss": 0.0652, - "step": 2294 - }, - { - "epoch": 41.72727272727273, - "grad_norm": 0.7314126491546631, - "learning_rate": 1.6581818181818181e-06, - "loss": 0.0555, - "step": 2295 - }, - { - "epoch": 41.74545454545454, - "grad_norm": 0.8552329540252686, - "learning_rate": 1.6545454545454548e-06, - "loss": 0.0629, - "step": 2296 - }, - { - "epoch": 41.763636363636365, - "grad_norm": 1.2490841150283813, - "learning_rate": 1.6509090909090911e-06, - "loss": 0.0638, - "step": 2297 - }, - { - "epoch": 41.78181818181818, - "grad_norm": 0.8550461530685425, - "learning_rate": 1.6472727272727274e-06, - "loss": 0.0568, - "step": 2298 - }, - { - "epoch": 41.8, - "grad_norm": 1.889506459236145, - "learning_rate": 1.6436363636363637e-06, - "loss": 0.0444, - "step": 2299 - }, - { - "epoch": 41.81818181818182, - "grad_norm": 1.0877445936203003, - "learning_rate": 1.6400000000000002e-06, - "loss": 0.055, - "step": 2300 - }, - { - "epoch": 41.836363636363636, - "grad_norm": 2.4363577365875244, - "learning_rate": 1.6363636363636365e-06, - "loss": 0.053, - "step": 2301 - }, - { - "epoch": 41.85454545454545, - "grad_norm": 1.876415729522705, - "learning_rate": 1.6327272727272727e-06, - "loss": 0.072, - "step": 2302 - }, - { - "epoch": 41.872727272727275, - "grad_norm": 3.156857967376709, - "learning_rate": 1.6290909090909092e-06, - "loss": 0.071, - "step": 2303 - }, - { - "epoch": 41.89090909090909, - "grad_norm": 1.9464811086654663, - "learning_rate": 1.6254545454545455e-06, - "loss": 0.0473, - "step": 2304 - }, - { - "epoch": 41.90909090909091, - "grad_norm": 1.8739292621612549, - "learning_rate": 1.6218181818181818e-06, - "loss": 0.065, - "step": 2305 - }, - { - "epoch": 41.92727272727273, - "grad_norm": 2.5977931022644043, - "learning_rate": 1.618181818181818e-06, - "loss": 0.0818, - "step": 2306 - }, - { - "epoch": 41.945454545454545, - "grad_norm": 0.8249573707580566, - "learning_rate": 1.6145454545454548e-06, - "loss": 0.0548, - "step": 2307 - }, - { - "epoch": 41.96363636363636, - "grad_norm": 1.6713016033172607, - "learning_rate": 1.610909090909091e-06, - "loss": 0.0648, - "step": 2308 - }, - { - "epoch": 41.981818181818184, - "grad_norm": 2.015080213546753, - "learning_rate": 1.6072727272727274e-06, - "loss": 0.0595, - "step": 2309 - }, - { - "epoch": 42.0, - "grad_norm": 3.037482500076294, - "learning_rate": 1.6036363636363639e-06, - "loss": 0.0493, - "step": 2310 - }, - { - "epoch": 42.0, - "eval_loss": 0.05649769306182861, - "eval_runtime": 8.6983, - "eval_samples_per_second": 625.985, - "eval_steps_per_second": 78.291, - "step": 2310 + "eval_loss": 0.2202187031507492, + "eval_runtime": 0.0285, + "eval_samples_per_second": 351.061, + "eval_steps_per_second": 70.212, + "step": 1 } ], "logging_steps": 1, - "max_steps": 2750, + "max_steps": 20, "num_input_tokens_seen": 0, - "num_train_epochs": 50, + "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": {