diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16573 @@ +{ + "best_metric": 0.051427390426397324, + "best_model_checkpoint": "time_base/checkpoint-2340", + "epoch": 20.0, + "eval_steps": 500, + "global_step": 2340, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008547008547008548, + "grad_norm": 221.6373748779297, + "learning_rate": 9.995726495726496e-06, + "loss": 37.5765, + "step": 1 + }, + { + "epoch": 0.017094017094017096, + "grad_norm": 219.50563049316406, + "learning_rate": 9.991452991452993e-06, + "loss": 38.6173, + "step": 2 + }, + { + "epoch": 0.02564102564102564, + "grad_norm": 180.23829650878906, + "learning_rate": 9.987179487179488e-06, + "loss": 40.3853, + "step": 3 + }, + { + "epoch": 0.03418803418803419, + "grad_norm": 166.3365478515625, + "learning_rate": 9.982905982905984e-06, + "loss": 35.9724, + "step": 4 + }, + { + "epoch": 0.042735042735042736, + "grad_norm": 199.6571044921875, + "learning_rate": 9.97863247863248e-06, + "loss": 35.0186, + "step": 5 + }, + { + "epoch": 0.05128205128205128, + "grad_norm": 180.9748992919922, + "learning_rate": 9.974358974358974e-06, + "loss": 39.3679, + "step": 6 + }, + { + "epoch": 0.05982905982905983, + "grad_norm": 200.05496215820312, + "learning_rate": 9.970085470085471e-06, + "loss": 37.1519, + "step": 7 + }, + { + "epoch": 0.06837606837606838, + "grad_norm": 154.3177032470703, + "learning_rate": 9.965811965811966e-06, + "loss": 33.9309, + "step": 8 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 198.05914306640625, + "learning_rate": 9.961538461538463e-06, + "loss": 34.8814, + "step": 9 + }, + { + "epoch": 0.08547008547008547, + "grad_norm": 168.3035430908203, + "learning_rate": 9.957264957264958e-06, + "loss": 33.184, + "step": 10 + }, + { + "epoch": 0.09401709401709402, + "grad_norm": 201.83705139160156, + "learning_rate": 9.952991452991455e-06, + "loss": 35.4025, + "step": 11 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 224.4587860107422, + "learning_rate": 9.94871794871795e-06, + "loss": 39.222, + "step": 12 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 192.1949005126953, + "learning_rate": 9.944444444444445e-06, + "loss": 37.1982, + "step": 13 + }, + { + "epoch": 0.11965811965811966, + "grad_norm": 193.05662536621094, + "learning_rate": 9.940170940170942e-06, + "loss": 38.1325, + "step": 14 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 150.61575317382812, + "learning_rate": 9.935897435897437e-06, + "loss": 34.8682, + "step": 15 + }, + { + "epoch": 0.13675213675213677, + "grad_norm": 170.1510772705078, + "learning_rate": 9.931623931623933e-06, + "loss": 33.3652, + "step": 16 + }, + { + "epoch": 0.1452991452991453, + "grad_norm": 193.86875915527344, + "learning_rate": 9.927350427350428e-06, + "loss": 35.0785, + "step": 17 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 164.41986083984375, + "learning_rate": 9.923076923076923e-06, + "loss": 31.9719, + "step": 18 + }, + { + "epoch": 0.1623931623931624, + "grad_norm": 166.08953857421875, + "learning_rate": 9.91880341880342e-06, + "loss": 34.5398, + "step": 19 + }, + { + "epoch": 0.17094017094017094, + "grad_norm": 152.2139892578125, + "learning_rate": 9.914529914529915e-06, + "loss": 36.9092, + "step": 20 + }, + { + "epoch": 0.1794871794871795, + "grad_norm": 198.23095703125, + "learning_rate": 9.910256410256412e-06, + "loss": 35.6744, + "step": 21 + }, + { + "epoch": 0.18803418803418803, + "grad_norm": 174.7784881591797, + "learning_rate": 9.905982905982907e-06, + "loss": 32.8258, + "step": 22 + }, + { + "epoch": 0.19658119658119658, + "grad_norm": 133.69859313964844, + "learning_rate": 9.901709401709402e-06, + "loss": 31.431, + "step": 23 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 217.17169189453125, + "learning_rate": 9.897435897435899e-06, + "loss": 38.5649, + "step": 24 + }, + { + "epoch": 0.21367521367521367, + "grad_norm": 172.4914093017578, + "learning_rate": 9.893162393162394e-06, + "loss": 33.9858, + "step": 25 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 186.39654541015625, + "learning_rate": 9.88888888888889e-06, + "loss": 32.8029, + "step": 26 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 183.65159606933594, + "learning_rate": 9.884615384615386e-06, + "loss": 35.8633, + "step": 27 + }, + { + "epoch": 0.23931623931623933, + "grad_norm": 228.352294921875, + "learning_rate": 9.880341880341882e-06, + "loss": 35.0285, + "step": 28 + }, + { + "epoch": 0.24786324786324787, + "grad_norm": 156.77906799316406, + "learning_rate": 9.876068376068377e-06, + "loss": 29.2608, + "step": 29 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 232.8336181640625, + "learning_rate": 9.871794871794872e-06, + "loss": 35.0349, + "step": 30 + }, + { + "epoch": 0.26495726495726496, + "grad_norm": 248.63247680664062, + "learning_rate": 9.86752136752137e-06, + "loss": 34.5067, + "step": 31 + }, + { + "epoch": 0.27350427350427353, + "grad_norm": 183.5840606689453, + "learning_rate": 9.863247863247864e-06, + "loss": 30.4758, + "step": 32 + }, + { + "epoch": 0.28205128205128205, + "grad_norm": 160.54530334472656, + "learning_rate": 9.858974358974361e-06, + "loss": 31.7959, + "step": 33 + }, + { + "epoch": 0.2905982905982906, + "grad_norm": 199.88156127929688, + "learning_rate": 9.854700854700856e-06, + "loss": 35.6482, + "step": 34 + }, + { + "epoch": 0.29914529914529914, + "grad_norm": 272.9530029296875, + "learning_rate": 9.850427350427351e-06, + "loss": 33.0804, + "step": 35 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 200.0990447998047, + "learning_rate": 9.846153846153848e-06, + "loss": 33.2675, + "step": 36 + }, + { + "epoch": 0.3162393162393162, + "grad_norm": 202.014404296875, + "learning_rate": 9.841880341880343e-06, + "loss": 30.8991, + "step": 37 + }, + { + "epoch": 0.3247863247863248, + "grad_norm": 181.14865112304688, + "learning_rate": 9.837606837606838e-06, + "loss": 32.3643, + "step": 38 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 134.43423461914062, + "learning_rate": 9.833333333333333e-06, + "loss": 30.8094, + "step": 39 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 155.96640014648438, + "learning_rate": 9.82905982905983e-06, + "loss": 31.7564, + "step": 40 + }, + { + "epoch": 0.3504273504273504, + "grad_norm": 146.9285888671875, + "learning_rate": 9.824786324786325e-06, + "loss": 31.9905, + "step": 41 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 159.67974853515625, + "learning_rate": 9.820512820512821e-06, + "loss": 32.5029, + "step": 42 + }, + { + "epoch": 0.36752136752136755, + "grad_norm": 172.4975128173828, + "learning_rate": 9.816239316239316e-06, + "loss": 31.2049, + "step": 43 + }, + { + "epoch": 0.37606837606837606, + "grad_norm": 148.97573852539062, + "learning_rate": 9.811965811965812e-06, + "loss": 27.1673, + "step": 44 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 115.93009185791016, + "learning_rate": 9.807692307692308e-06, + "loss": 30.3342, + "step": 45 + }, + { + "epoch": 0.39316239316239315, + "grad_norm": 184.13145446777344, + "learning_rate": 9.803418803418803e-06, + "loss": 32.317, + "step": 46 + }, + { + "epoch": 0.4017094017094017, + "grad_norm": 139.3995361328125, + "learning_rate": 9.7991452991453e-06, + "loss": 29.9643, + "step": 47 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 184.97996520996094, + "learning_rate": 9.794871794871795e-06, + "loss": 30.6427, + "step": 48 + }, + { + "epoch": 0.4188034188034188, + "grad_norm": 120.04417419433594, + "learning_rate": 9.790598290598292e-06, + "loss": 26.9772, + "step": 49 + }, + { + "epoch": 0.42735042735042733, + "grad_norm": 183.2873077392578, + "learning_rate": 9.786324786324787e-06, + "loss": 31.6688, + "step": 50 + }, + { + "epoch": 0.4358974358974359, + "grad_norm": 206.44898986816406, + "learning_rate": 9.782051282051282e-06, + "loss": 32.0574, + "step": 51 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 180.7601318359375, + "learning_rate": 9.777777777777779e-06, + "loss": 31.2178, + "step": 52 + }, + { + "epoch": 0.452991452991453, + "grad_norm": 150.44012451171875, + "learning_rate": 9.773504273504274e-06, + "loss": 29.9826, + "step": 53 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 119.02840423583984, + "learning_rate": 9.76923076923077e-06, + "loss": 26.876, + "step": 54 + }, + { + "epoch": 0.4700854700854701, + "grad_norm": 164.58209228515625, + "learning_rate": 9.764957264957265e-06, + "loss": 28.1059, + "step": 55 + }, + { + "epoch": 0.47863247863247865, + "grad_norm": 160.416259765625, + "learning_rate": 9.76068376068376e-06, + "loss": 28.7022, + "step": 56 + }, + { + "epoch": 0.48717948717948717, + "grad_norm": 177.29747009277344, + "learning_rate": 9.756410256410257e-06, + "loss": 30.7275, + "step": 57 + }, + { + "epoch": 0.49572649572649574, + "grad_norm": 153.59686279296875, + "learning_rate": 9.752136752136752e-06, + "loss": 28.5575, + "step": 58 + }, + { + "epoch": 0.5042735042735043, + "grad_norm": 155.79617309570312, + "learning_rate": 9.747863247863249e-06, + "loss": 28.1139, + "step": 59 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 173.02581787109375, + "learning_rate": 9.743589743589744e-06, + "loss": 30.4744, + "step": 60 + }, + { + "epoch": 0.5213675213675214, + "grad_norm": 125.31639862060547, + "learning_rate": 9.739316239316239e-06, + "loss": 26.5559, + "step": 61 + }, + { + "epoch": 0.5299145299145299, + "grad_norm": 149.00302124023438, + "learning_rate": 9.735042735042736e-06, + "loss": 30.4065, + "step": 62 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 101.76395416259766, + "learning_rate": 9.730769230769231e-06, + "loss": 25.8895, + "step": 63 + }, + { + "epoch": 0.5470085470085471, + "grad_norm": 134.40159606933594, + "learning_rate": 9.726495726495728e-06, + "loss": 26.9317, + "step": 64 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 151.01914978027344, + "learning_rate": 9.722222222222223e-06, + "loss": 27.9913, + "step": 65 + }, + { + "epoch": 0.5641025641025641, + "grad_norm": 124.92068481445312, + "learning_rate": 9.71794871794872e-06, + "loss": 26.7874, + "step": 66 + }, + { + "epoch": 0.5726495726495726, + "grad_norm": 131.29762268066406, + "learning_rate": 9.713675213675214e-06, + "loss": 27.4047, + "step": 67 + }, + { + "epoch": 0.5811965811965812, + "grad_norm": 154.37120056152344, + "learning_rate": 9.70940170940171e-06, + "loss": 26.6812, + "step": 68 + }, + { + "epoch": 0.5897435897435898, + "grad_norm": 86.31095886230469, + "learning_rate": 9.705128205128206e-06, + "loss": 22.9869, + "step": 69 + }, + { + "epoch": 0.5982905982905983, + "grad_norm": 224.42613220214844, + "learning_rate": 9.700854700854701e-06, + "loss": 28.4812, + "step": 70 + }, + { + "epoch": 0.6068376068376068, + "grad_norm": 156.15228271484375, + "learning_rate": 9.696581196581198e-06, + "loss": 26.1761, + "step": 71 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 117.7806167602539, + "learning_rate": 9.692307692307693e-06, + "loss": 20.7307, + "step": 72 + }, + { + "epoch": 0.6239316239316239, + "grad_norm": 169.99154663085938, + "learning_rate": 9.688034188034188e-06, + "loss": 27.6369, + "step": 73 + }, + { + "epoch": 0.6324786324786325, + "grad_norm": 98.81549072265625, + "learning_rate": 9.683760683760685e-06, + "loss": 24.5898, + "step": 74 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 199.0179443359375, + "learning_rate": 9.67948717948718e-06, + "loss": 27.664, + "step": 75 + }, + { + "epoch": 0.6495726495726496, + "grad_norm": 129.81033325195312, + "learning_rate": 9.675213675213677e-06, + "loss": 25.2547, + "step": 76 + }, + { + "epoch": 0.6581196581196581, + "grad_norm": 140.1121826171875, + "learning_rate": 9.670940170940172e-06, + "loss": 27.4914, + "step": 77 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 139.8365478515625, + "learning_rate": 9.666666666666667e-06, + "loss": 24.0178, + "step": 78 + }, + { + "epoch": 0.6752136752136753, + "grad_norm": 147.24945068359375, + "learning_rate": 9.662393162393163e-06, + "loss": 27.1404, + "step": 79 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 165.67242431640625, + "learning_rate": 9.658119658119659e-06, + "loss": 25.6604, + "step": 80 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 114.36772918701172, + "learning_rate": 9.653846153846155e-06, + "loss": 24.3695, + "step": 81 + }, + { + "epoch": 0.7008547008547008, + "grad_norm": 149.76258850097656, + "learning_rate": 9.64957264957265e-06, + "loss": 26.5265, + "step": 82 + }, + { + "epoch": 0.7094017094017094, + "grad_norm": 121.9085693359375, + "learning_rate": 9.645299145299147e-06, + "loss": 25.7008, + "step": 83 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 106.49151611328125, + "learning_rate": 9.641025641025642e-06, + "loss": 20.5777, + "step": 84 + }, + { + "epoch": 0.7264957264957265, + "grad_norm": 114.2357406616211, + "learning_rate": 9.636752136752137e-06, + "loss": 23.3429, + "step": 85 + }, + { + "epoch": 0.7350427350427351, + "grad_norm": 107.38651275634766, + "learning_rate": 9.632478632478634e-06, + "loss": 24.6408, + "step": 86 + }, + { + "epoch": 0.7435897435897436, + "grad_norm": 120.4283218383789, + "learning_rate": 9.628205128205129e-06, + "loss": 23.4563, + "step": 87 + }, + { + "epoch": 0.7521367521367521, + "grad_norm": 165.21783447265625, + "learning_rate": 9.623931623931626e-06, + "loss": 25.878, + "step": 88 + }, + { + "epoch": 0.7606837606837606, + "grad_norm": 105.8712387084961, + "learning_rate": 9.61965811965812e-06, + "loss": 23.605, + "step": 89 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 147.31253051757812, + "learning_rate": 9.615384615384616e-06, + "loss": 24.537, + "step": 90 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 127.37718963623047, + "learning_rate": 9.611111111111112e-06, + "loss": 24.6762, + "step": 91 + }, + { + "epoch": 0.7863247863247863, + "grad_norm": 139.40553283691406, + "learning_rate": 9.606837606837607e-06, + "loss": 23.6076, + "step": 92 + }, + { + "epoch": 0.7948717948717948, + "grad_norm": 218.39170837402344, + "learning_rate": 9.602564102564104e-06, + "loss": 25.2559, + "step": 93 + }, + { + "epoch": 0.8034188034188035, + "grad_norm": 115.83401489257812, + "learning_rate": 9.5982905982906e-06, + "loss": 23.6758, + "step": 94 + }, + { + "epoch": 0.811965811965812, + "grad_norm": 115.8538818359375, + "learning_rate": 9.594017094017094e-06, + "loss": 24.2789, + "step": 95 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 122.31534576416016, + "learning_rate": 9.589743589743591e-06, + "loss": 23.5114, + "step": 96 + }, + { + "epoch": 0.8290598290598291, + "grad_norm": 171.58558654785156, + "learning_rate": 9.585470085470086e-06, + "loss": 24.7028, + "step": 97 + }, + { + "epoch": 0.8376068376068376, + "grad_norm": 113.29806518554688, + "learning_rate": 9.581196581196583e-06, + "loss": 24.9667, + "step": 98 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 183.74928283691406, + "learning_rate": 9.576923076923078e-06, + "loss": 24.7776, + "step": 99 + }, + { + "epoch": 0.8547008547008547, + "grad_norm": 139.84701538085938, + "learning_rate": 9.572649572649575e-06, + "loss": 22.1558, + "step": 100 + }, + { + "epoch": 0.8632478632478633, + "grad_norm": 145.9014129638672, + "learning_rate": 9.56837606837607e-06, + "loss": 23.0282, + "step": 101 + }, + { + "epoch": 0.8717948717948718, + "grad_norm": 195.9859619140625, + "learning_rate": 9.564102564102565e-06, + "loss": 23.7194, + "step": 102 + }, + { + "epoch": 0.8803418803418803, + "grad_norm": 70.51985168457031, + "learning_rate": 9.559829059829061e-06, + "loss": 16.9605, + "step": 103 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 184.04209899902344, + "learning_rate": 9.555555555555556e-06, + "loss": 23.4229, + "step": 104 + }, + { + "epoch": 0.8974358974358975, + "grad_norm": 177.86727905273438, + "learning_rate": 9.551282051282053e-06, + "loss": 23.6004, + "step": 105 + }, + { + "epoch": 0.905982905982906, + "grad_norm": 154.30784606933594, + "learning_rate": 9.547008547008548e-06, + "loss": 21.6725, + "step": 106 + }, + { + "epoch": 0.9145299145299145, + "grad_norm": 104.27069854736328, + "learning_rate": 9.542735042735043e-06, + "loss": 22.856, + "step": 107 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 157.4270477294922, + "learning_rate": 9.53846153846154e-06, + "loss": 24.398, + "step": 108 + }, + { + "epoch": 0.9316239316239316, + "grad_norm": 123.56739807128906, + "learning_rate": 9.534188034188035e-06, + "loss": 20.6925, + "step": 109 + }, + { + "epoch": 0.9401709401709402, + "grad_norm": 106.64054870605469, + "learning_rate": 9.52991452991453e-06, + "loss": 23.5794, + "step": 110 + }, + { + "epoch": 0.9487179487179487, + "grad_norm": 88.68234252929688, + "learning_rate": 9.525641025641025e-06, + "loss": 20.729, + "step": 111 + }, + { + "epoch": 0.9572649572649573, + "grad_norm": 91.86422729492188, + "learning_rate": 9.521367521367522e-06, + "loss": 18.7701, + "step": 112 + }, + { + "epoch": 0.9658119658119658, + "grad_norm": 118.74354553222656, + "learning_rate": 9.517094017094017e-06, + "loss": 20.8439, + "step": 113 + }, + { + "epoch": 0.9743589743589743, + "grad_norm": 120.72904968261719, + "learning_rate": 9.512820512820514e-06, + "loss": 21.1903, + "step": 114 + }, + { + "epoch": 0.9829059829059829, + "grad_norm": 107.36665344238281, + "learning_rate": 9.508547008547009e-06, + "loss": 19.3457, + "step": 115 + }, + { + "epoch": 0.9914529914529915, + "grad_norm": 131.74441528320312, + "learning_rate": 9.504273504273504e-06, + "loss": 21.4035, + "step": 116 + }, + { + "epoch": 1.0, + "grad_norm": 161.97703552246094, + "learning_rate": 9.5e-06, + "loss": 22.3831, + "step": 117 + }, + { + "epoch": 1.0, + "eval_loss": 17.230430603027344, + "eval_runtime": 9.9187, + "eval_samples_per_second": 46.982, + "eval_steps_per_second": 5.948, + "step": 117 + }, + { + "epoch": 1.0085470085470085, + "grad_norm": 109.44770050048828, + "learning_rate": 9.495726495726496e-06, + "loss": 20.3406, + "step": 118 + }, + { + "epoch": 1.017094017094017, + "grad_norm": 96.50030517578125, + "learning_rate": 9.491452991452992e-06, + "loss": 19.8086, + "step": 119 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 91.27509307861328, + "learning_rate": 9.487179487179487e-06, + "loss": 18.8737, + "step": 120 + }, + { + "epoch": 1.0341880341880343, + "grad_norm": 123.94478607177734, + "learning_rate": 9.482905982905984e-06, + "loss": 20.1785, + "step": 121 + }, + { + "epoch": 1.0427350427350428, + "grad_norm": 109.29426574707031, + "learning_rate": 9.478632478632479e-06, + "loss": 18.8151, + "step": 122 + }, + { + "epoch": 1.0512820512820513, + "grad_norm": 104.0233383178711, + "learning_rate": 9.474358974358974e-06, + "loss": 19.6281, + "step": 123 + }, + { + "epoch": 1.0598290598290598, + "grad_norm": 75.7523193359375, + "learning_rate": 9.470085470085471e-06, + "loss": 18.5031, + "step": 124 + }, + { + "epoch": 1.0683760683760684, + "grad_norm": 103.1374740600586, + "learning_rate": 9.465811965811966e-06, + "loss": 19.6443, + "step": 125 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 92.68035888671875, + "learning_rate": 9.461538461538463e-06, + "loss": 19.7327, + "step": 126 + }, + { + "epoch": 1.0854700854700854, + "grad_norm": 88.10079193115234, + "learning_rate": 9.457264957264958e-06, + "loss": 17.8832, + "step": 127 + }, + { + "epoch": 1.0940170940170941, + "grad_norm": 80.04244232177734, + "learning_rate": 9.452991452991453e-06, + "loss": 16.4485, + "step": 128 + }, + { + "epoch": 1.1025641025641026, + "grad_norm": 81.02445983886719, + "learning_rate": 9.44871794871795e-06, + "loss": 17.3035, + "step": 129 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 98.95979309082031, + "learning_rate": 9.444444444444445e-06, + "loss": 17.5734, + "step": 130 + }, + { + "epoch": 1.1196581196581197, + "grad_norm": 109.76984405517578, + "learning_rate": 9.440170940170941e-06, + "loss": 20.3985, + "step": 131 + }, + { + "epoch": 1.1282051282051282, + "grad_norm": 98.52857208251953, + "learning_rate": 9.435897435897436e-06, + "loss": 17.7275, + "step": 132 + }, + { + "epoch": 1.1367521367521367, + "grad_norm": 91.28802490234375, + "learning_rate": 9.431623931623931e-06, + "loss": 17.9107, + "step": 133 + }, + { + "epoch": 1.1452991452991452, + "grad_norm": 92.89081573486328, + "learning_rate": 9.427350427350428e-06, + "loss": 18.2876, + "step": 134 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 78.9795150756836, + "learning_rate": 9.423076923076923e-06, + "loss": 15.5738, + "step": 135 + }, + { + "epoch": 1.1623931623931625, + "grad_norm": 83.77166748046875, + "learning_rate": 9.41880341880342e-06, + "loss": 16.0825, + "step": 136 + }, + { + "epoch": 1.170940170940171, + "grad_norm": 129.62966918945312, + "learning_rate": 9.414529914529915e-06, + "loss": 18.4077, + "step": 137 + }, + { + "epoch": 1.1794871794871795, + "grad_norm": 110.26199340820312, + "learning_rate": 9.410256410256412e-06, + "loss": 17.6436, + "step": 138 + }, + { + "epoch": 1.188034188034188, + "grad_norm": 95.36865997314453, + "learning_rate": 9.405982905982907e-06, + "loss": 19.0424, + "step": 139 + }, + { + "epoch": 1.1965811965811965, + "grad_norm": 98.36263275146484, + "learning_rate": 9.401709401709402e-06, + "loss": 16.6122, + "step": 140 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 83.68401336669922, + "learning_rate": 9.397435897435899e-06, + "loss": 14.9218, + "step": 141 + }, + { + "epoch": 1.2136752136752136, + "grad_norm": 92.4602279663086, + "learning_rate": 9.393162393162394e-06, + "loss": 16.3563, + "step": 142 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 106.41629791259766, + "learning_rate": 9.38888888888889e-06, + "loss": 16.4447, + "step": 143 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 97.70237731933594, + "learning_rate": 9.384615384615385e-06, + "loss": 16.8154, + "step": 144 + }, + { + "epoch": 1.2393162393162394, + "grad_norm": 76.88361358642578, + "learning_rate": 9.38034188034188e-06, + "loss": 15.7116, + "step": 145 + }, + { + "epoch": 1.2478632478632479, + "grad_norm": 104.20966339111328, + "learning_rate": 9.376068376068377e-06, + "loss": 15.2283, + "step": 146 + }, + { + "epoch": 1.2564102564102564, + "grad_norm": 80.29965209960938, + "learning_rate": 9.371794871794872e-06, + "loss": 15.3238, + "step": 147 + }, + { + "epoch": 1.264957264957265, + "grad_norm": 72.6979751586914, + "learning_rate": 9.367521367521369e-06, + "loss": 14.2293, + "step": 148 + }, + { + "epoch": 1.2735042735042734, + "grad_norm": 80.29464721679688, + "learning_rate": 9.363247863247864e-06, + "loss": 11.9706, + "step": 149 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 97.80663299560547, + "learning_rate": 9.358974358974359e-06, + "loss": 14.3517, + "step": 150 + }, + { + "epoch": 1.2905982905982907, + "grad_norm": 75.88921356201172, + "learning_rate": 9.354700854700856e-06, + "loss": 12.8289, + "step": 151 + }, + { + "epoch": 1.2991452991452992, + "grad_norm": 75.71963500976562, + "learning_rate": 9.35042735042735e-06, + "loss": 15.2496, + "step": 152 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 84.5454330444336, + "learning_rate": 9.346153846153847e-06, + "loss": 15.7946, + "step": 153 + }, + { + "epoch": 1.3162393162393162, + "grad_norm": 92.24919128417969, + "learning_rate": 9.341880341880343e-06, + "loss": 13.2751, + "step": 154 + }, + { + "epoch": 1.3247863247863247, + "grad_norm": 76.51255798339844, + "learning_rate": 9.33760683760684e-06, + "loss": 14.1861, + "step": 155 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 74.59149169921875, + "learning_rate": 9.333333333333334e-06, + "loss": 12.1881, + "step": 156 + }, + { + "epoch": 1.341880341880342, + "grad_norm": 69.84959411621094, + "learning_rate": 9.32905982905983e-06, + "loss": 13.1244, + "step": 157 + }, + { + "epoch": 1.3504273504273505, + "grad_norm": 82.09815979003906, + "learning_rate": 9.324786324786326e-06, + "loss": 12.7492, + "step": 158 + }, + { + "epoch": 1.358974358974359, + "grad_norm": 87.25080108642578, + "learning_rate": 9.320512820512821e-06, + "loss": 15.5268, + "step": 159 + }, + { + "epoch": 1.3675213675213675, + "grad_norm": 51.60975646972656, + "learning_rate": 9.316239316239318e-06, + "loss": 10.9868, + "step": 160 + }, + { + "epoch": 1.376068376068376, + "grad_norm": 65.10023498535156, + "learning_rate": 9.311965811965813e-06, + "loss": 13.2106, + "step": 161 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 86.3865737915039, + "learning_rate": 9.307692307692308e-06, + "loss": 12.4873, + "step": 162 + }, + { + "epoch": 1.393162393162393, + "grad_norm": 89.5868911743164, + "learning_rate": 9.303418803418805e-06, + "loss": 12.3125, + "step": 163 + }, + { + "epoch": 1.4017094017094016, + "grad_norm": 87.308837890625, + "learning_rate": 9.2991452991453e-06, + "loss": 13.1855, + "step": 164 + }, + { + "epoch": 1.4102564102564101, + "grad_norm": 79.86372375488281, + "learning_rate": 9.294871794871796e-06, + "loss": 11.2756, + "step": 165 + }, + { + "epoch": 1.4188034188034189, + "grad_norm": 64.42597961425781, + "learning_rate": 9.290598290598292e-06, + "loss": 11.7395, + "step": 166 + }, + { + "epoch": 1.4273504273504274, + "grad_norm": 64.65245056152344, + "learning_rate": 9.286324786324787e-06, + "loss": 10.2739, + "step": 167 + }, + { + "epoch": 1.435897435897436, + "grad_norm": 49.57310104370117, + "learning_rate": 9.282051282051283e-06, + "loss": 11.4798, + "step": 168 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 89.93653106689453, + "learning_rate": 9.277777777777778e-06, + "loss": 13.8041, + "step": 169 + }, + { + "epoch": 1.452991452991453, + "grad_norm": 59.6973876953125, + "learning_rate": 9.273504273504275e-06, + "loss": 11.0414, + "step": 170 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 63.07640838623047, + "learning_rate": 9.26923076923077e-06, + "loss": 10.2649, + "step": 171 + }, + { + "epoch": 1.4700854700854702, + "grad_norm": 121.3633041381836, + "learning_rate": 9.264957264957267e-06, + "loss": 11.9233, + "step": 172 + }, + { + "epoch": 1.4786324786324787, + "grad_norm": 50.96989822387695, + "learning_rate": 9.260683760683762e-06, + "loss": 8.3527, + "step": 173 + }, + { + "epoch": 1.4871794871794872, + "grad_norm": 71.61744689941406, + "learning_rate": 9.256410256410257e-06, + "loss": 11.4237, + "step": 174 + }, + { + "epoch": 1.4957264957264957, + "grad_norm": 69.43048858642578, + "learning_rate": 9.252136752136754e-06, + "loss": 9.9193, + "step": 175 + }, + { + "epoch": 1.5042735042735043, + "grad_norm": 130.2714385986328, + "learning_rate": 9.247863247863249e-06, + "loss": 12.0676, + "step": 176 + }, + { + "epoch": 1.5128205128205128, + "grad_norm": 51.40456008911133, + "learning_rate": 9.243589743589745e-06, + "loss": 9.2348, + "step": 177 + }, + { + "epoch": 1.5213675213675213, + "grad_norm": 48.94670486450195, + "learning_rate": 9.23931623931624e-06, + "loss": 8.8217, + "step": 178 + }, + { + "epoch": 1.5299145299145298, + "grad_norm": 54.54533386230469, + "learning_rate": 9.235042735042736e-06, + "loss": 9.2478, + "step": 179 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 46.581939697265625, + "learning_rate": 9.230769230769232e-06, + "loss": 8.746, + "step": 180 + }, + { + "epoch": 1.547008547008547, + "grad_norm": 49.31954574584961, + "learning_rate": 9.226495726495727e-06, + "loss": 8.7889, + "step": 181 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 48.5145378112793, + "learning_rate": 9.222222222222224e-06, + "loss": 8.4478, + "step": 182 + }, + { + "epoch": 1.564102564102564, + "grad_norm": 49.587825775146484, + "learning_rate": 9.217948717948717e-06, + "loss": 10.5022, + "step": 183 + }, + { + "epoch": 1.5726495726495726, + "grad_norm": 47.89423751831055, + "learning_rate": 9.213675213675214e-06, + "loss": 8.7681, + "step": 184 + }, + { + "epoch": 1.5811965811965814, + "grad_norm": 59.971920013427734, + "learning_rate": 9.20940170940171e-06, + "loss": 9.6469, + "step": 185 + }, + { + "epoch": 1.5897435897435899, + "grad_norm": 41.139957427978516, + "learning_rate": 9.205128205128206e-06, + "loss": 8.5196, + "step": 186 + }, + { + "epoch": 1.5982905982905984, + "grad_norm": 36.8078498840332, + "learning_rate": 9.200854700854701e-06, + "loss": 8.2513, + "step": 187 + }, + { + "epoch": 1.606837606837607, + "grad_norm": 62.23011016845703, + "learning_rate": 9.196581196581196e-06, + "loss": 9.239, + "step": 188 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 41.35377502441406, + "learning_rate": 9.192307692307693e-06, + "loss": 8.6788, + "step": 189 + }, + { + "epoch": 1.623931623931624, + "grad_norm": 53.734134674072266, + "learning_rate": 9.188034188034188e-06, + "loss": 8.2624, + "step": 190 + }, + { + "epoch": 1.6324786324786325, + "grad_norm": 60.738887786865234, + "learning_rate": 9.183760683760685e-06, + "loss": 9.2777, + "step": 191 + }, + { + "epoch": 1.641025641025641, + "grad_norm": 26.411643981933594, + "learning_rate": 9.17948717948718e-06, + "loss": 7.6894, + "step": 192 + }, + { + "epoch": 1.6495726495726495, + "grad_norm": 37.81135940551758, + "learning_rate": 9.175213675213676e-06, + "loss": 8.009, + "step": 193 + }, + { + "epoch": 1.658119658119658, + "grad_norm": 42.451080322265625, + "learning_rate": 9.170940170940171e-06, + "loss": 8.309, + "step": 194 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 54.87519073486328, + "learning_rate": 9.166666666666666e-06, + "loss": 8.3505, + "step": 195 + }, + { + "epoch": 1.6752136752136753, + "grad_norm": 47.997737884521484, + "learning_rate": 9.162393162393163e-06, + "loss": 8.9444, + "step": 196 + }, + { + "epoch": 1.6837606837606838, + "grad_norm": 33.1911506652832, + "learning_rate": 9.158119658119658e-06, + "loss": 6.8856, + "step": 197 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 28.42953872680664, + "learning_rate": 9.153846153846155e-06, + "loss": 7.0575, + "step": 198 + }, + { + "epoch": 1.7008547008547008, + "grad_norm": 34.74330520629883, + "learning_rate": 9.14957264957265e-06, + "loss": 7.6837, + "step": 199 + }, + { + "epoch": 1.7094017094017095, + "grad_norm": 27.730812072753906, + "learning_rate": 9.145299145299145e-06, + "loss": 7.2591, + "step": 200 + }, + { + "epoch": 1.717948717948718, + "grad_norm": 36.658966064453125, + "learning_rate": 9.141025641025642e-06, + "loss": 7.6744, + "step": 201 + }, + { + "epoch": 1.7264957264957266, + "grad_norm": 52.580074310302734, + "learning_rate": 9.136752136752137e-06, + "loss": 8.9746, + "step": 202 + }, + { + "epoch": 1.735042735042735, + "grad_norm": 26.30430030822754, + "learning_rate": 9.132478632478634e-06, + "loss": 7.0829, + "step": 203 + }, + { + "epoch": 1.7435897435897436, + "grad_norm": 35.77456283569336, + "learning_rate": 9.128205128205129e-06, + "loss": 7.46, + "step": 204 + }, + { + "epoch": 1.7521367521367521, + "grad_norm": 46.80126953125, + "learning_rate": 9.123931623931624e-06, + "loss": 8.0331, + "step": 205 + }, + { + "epoch": 1.7606837606837606, + "grad_norm": 26.510988235473633, + "learning_rate": 9.11965811965812e-06, + "loss": 7.0434, + "step": 206 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 30.846357345581055, + "learning_rate": 9.115384615384615e-06, + "loss": 6.9022, + "step": 207 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 45.06099319458008, + "learning_rate": 9.111111111111112e-06, + "loss": 7.108, + "step": 208 + }, + { + "epoch": 1.7863247863247862, + "grad_norm": 40.050079345703125, + "learning_rate": 9.106837606837607e-06, + "loss": 7.3628, + "step": 209 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 32.066261291503906, + "learning_rate": 9.102564102564104e-06, + "loss": 7.3292, + "step": 210 + }, + { + "epoch": 1.8034188034188035, + "grad_norm": 29.196252822875977, + "learning_rate": 9.098290598290599e-06, + "loss": 6.6194, + "step": 211 + }, + { + "epoch": 1.811965811965812, + "grad_norm": 34.54549026489258, + "learning_rate": 9.094017094017094e-06, + "loss": 7.224, + "step": 212 + }, + { + "epoch": 1.8205128205128205, + "grad_norm": 31.863550186157227, + "learning_rate": 9.08974358974359e-06, + "loss": 7.141, + "step": 213 + }, + { + "epoch": 1.8290598290598292, + "grad_norm": 36.79090118408203, + "learning_rate": 9.085470085470086e-06, + "loss": 6.9572, + "step": 214 + }, + { + "epoch": 1.8376068376068377, + "grad_norm": 24.298635482788086, + "learning_rate": 9.081196581196583e-06, + "loss": 6.6881, + "step": 215 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 16.75456428527832, + "learning_rate": 9.076923076923078e-06, + "loss": 6.4055, + "step": 216 + }, + { + "epoch": 1.8547008547008548, + "grad_norm": 20.152400970458984, + "learning_rate": 9.072649572649573e-06, + "loss": 6.9078, + "step": 217 + }, + { + "epoch": 1.8632478632478633, + "grad_norm": 34.73337173461914, + "learning_rate": 9.06837606837607e-06, + "loss": 6.7923, + "step": 218 + }, + { + "epoch": 1.8717948717948718, + "grad_norm": 28.418310165405273, + "learning_rate": 9.064102564102564e-06, + "loss": 6.9382, + "step": 219 + }, + { + "epoch": 1.8803418803418803, + "grad_norm": 13.454174995422363, + "learning_rate": 9.059829059829061e-06, + "loss": 4.5504, + "step": 220 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 20.746938705444336, + "learning_rate": 9.055555555555556e-06, + "loss": 6.4711, + "step": 221 + }, + { + "epoch": 1.8974358974358974, + "grad_norm": 23.29437828063965, + "learning_rate": 9.051282051282051e-06, + "loss": 6.1381, + "step": 222 + }, + { + "epoch": 1.9059829059829059, + "grad_norm": 31.720672607421875, + "learning_rate": 9.047008547008548e-06, + "loss": 6.7716, + "step": 223 + }, + { + "epoch": 1.9145299145299144, + "grad_norm": 16.971572875976562, + "learning_rate": 9.042735042735043e-06, + "loss": 6.4734, + "step": 224 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 25.185396194458008, + "learning_rate": 9.03846153846154e-06, + "loss": 6.2505, + "step": 225 + }, + { + "epoch": 1.9316239316239316, + "grad_norm": 42.373863220214844, + "learning_rate": 9.034188034188035e-06, + "loss": 7.1968, + "step": 226 + }, + { + "epoch": 1.9401709401709402, + "grad_norm": 21.06004524230957, + "learning_rate": 9.029914529914532e-06, + "loss": 6.082, + "step": 227 + }, + { + "epoch": 1.9487179487179487, + "grad_norm": 21.413599014282227, + "learning_rate": 9.025641025641027e-06, + "loss": 6.2279, + "step": 228 + }, + { + "epoch": 1.9572649572649574, + "grad_norm": 18.379974365234375, + "learning_rate": 9.021367521367522e-06, + "loss": 6.6032, + "step": 229 + }, + { + "epoch": 1.965811965811966, + "grad_norm": 28.239042282104492, + "learning_rate": 9.017094017094018e-06, + "loss": 6.5428, + "step": 230 + }, + { + "epoch": 1.9743589743589745, + "grad_norm": 17.92879867553711, + "learning_rate": 9.012820512820513e-06, + "loss": 5.986, + "step": 231 + }, + { + "epoch": 1.982905982905983, + "grad_norm": 15.501392364501953, + "learning_rate": 9.00854700854701e-06, + "loss": 5.9526, + "step": 232 + }, + { + "epoch": 1.9914529914529915, + "grad_norm": 23.742633819580078, + "learning_rate": 9.004273504273505e-06, + "loss": 6.2462, + "step": 233 + }, + { + "epoch": 2.0, + "grad_norm": 28.22560691833496, + "learning_rate": 9e-06, + "loss": 5.8705, + "step": 234 + }, + { + "epoch": 2.0, + "eval_loss": 5.379393577575684, + "eval_runtime": 9.2791, + "eval_samples_per_second": 50.22, + "eval_steps_per_second": 6.358, + "step": 234 + }, + { + "epoch": 2.0085470085470085, + "grad_norm": 21.7072696685791, + "learning_rate": 8.995726495726497e-06, + "loss": 6.2757, + "step": 235 + }, + { + "epoch": 2.017094017094017, + "grad_norm": 20.955190658569336, + "learning_rate": 8.991452991452992e-06, + "loss": 5.7265, + "step": 236 + }, + { + "epoch": 2.0256410256410255, + "grad_norm": 15.186567306518555, + "learning_rate": 8.987179487179489e-06, + "loss": 6.1958, + "step": 237 + }, + { + "epoch": 2.034188034188034, + "grad_norm": 20.938766479492188, + "learning_rate": 8.982905982905984e-06, + "loss": 6.2317, + "step": 238 + }, + { + "epoch": 2.0427350427350426, + "grad_norm": 18.457494735717773, + "learning_rate": 8.978632478632479e-06, + "loss": 6.4711, + "step": 239 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 43.505149841308594, + "learning_rate": 8.974358974358976e-06, + "loss": 5.9632, + "step": 240 + }, + { + "epoch": 2.0598290598290596, + "grad_norm": 15.558544158935547, + "learning_rate": 8.97008547008547e-06, + "loss": 5.8099, + "step": 241 + }, + { + "epoch": 2.0683760683760686, + "grad_norm": 22.20660400390625, + "learning_rate": 8.965811965811967e-06, + "loss": 5.7939, + "step": 242 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 15.866617202758789, + "learning_rate": 8.961538461538462e-06, + "loss": 5.9473, + "step": 243 + }, + { + "epoch": 2.0854700854700856, + "grad_norm": 20.30729103088379, + "learning_rate": 8.957264957264959e-06, + "loss": 6.2028, + "step": 244 + }, + { + "epoch": 2.094017094017094, + "grad_norm": 15.517614364624023, + "learning_rate": 8.952991452991454e-06, + "loss": 5.906, + "step": 245 + }, + { + "epoch": 2.1025641025641026, + "grad_norm": 21.30764389038086, + "learning_rate": 8.94871794871795e-06, + "loss": 6.1907, + "step": 246 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 19.973115921020508, + "learning_rate": 8.944444444444446e-06, + "loss": 5.6895, + "step": 247 + }, + { + "epoch": 2.1196581196581197, + "grad_norm": 17.40595817565918, + "learning_rate": 8.940170940170941e-06, + "loss": 5.4836, + "step": 248 + }, + { + "epoch": 2.128205128205128, + "grad_norm": 27.667421340942383, + "learning_rate": 8.935897435897438e-06, + "loss": 5.9082, + "step": 249 + }, + { + "epoch": 2.1367521367521367, + "grad_norm": 18.151315689086914, + "learning_rate": 8.931623931623933e-06, + "loss": 5.8102, + "step": 250 + }, + { + "epoch": 2.1452991452991452, + "grad_norm": 15.390297889709473, + "learning_rate": 8.927350427350428e-06, + "loss": 5.5504, + "step": 251 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 17.257841110229492, + "learning_rate": 8.923076923076925e-06, + "loss": 5.9043, + "step": 252 + }, + { + "epoch": 2.1623931623931623, + "grad_norm": 19.2503604888916, + "learning_rate": 8.91880341880342e-06, + "loss": 5.8349, + "step": 253 + }, + { + "epoch": 2.1709401709401708, + "grad_norm": 25.236759185791016, + "learning_rate": 8.914529914529916e-06, + "loss": 5.2908, + "step": 254 + }, + { + "epoch": 2.1794871794871793, + "grad_norm": 13.771193504333496, + "learning_rate": 8.910256410256411e-06, + "loss": 5.4743, + "step": 255 + }, + { + "epoch": 2.1880341880341883, + "grad_norm": 17.406471252441406, + "learning_rate": 8.905982905982906e-06, + "loss": 5.6856, + "step": 256 + }, + { + "epoch": 2.1965811965811968, + "grad_norm": 14.727091789245605, + "learning_rate": 8.901709401709401e-06, + "loss": 5.7937, + "step": 257 + }, + { + "epoch": 2.2051282051282053, + "grad_norm": 18.193246841430664, + "learning_rate": 8.897435897435898e-06, + "loss": 5.5704, + "step": 258 + }, + { + "epoch": 2.213675213675214, + "grad_norm": 21.573726654052734, + "learning_rate": 8.893162393162393e-06, + "loss": 5.479, + "step": 259 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 28.72640037536621, + "learning_rate": 8.888888888888888e-06, + "loss": 5.5096, + "step": 260 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 15.4992094039917, + "learning_rate": 8.884615384615385e-06, + "loss": 5.217, + "step": 261 + }, + { + "epoch": 2.2393162393162394, + "grad_norm": 17.753416061401367, + "learning_rate": 8.88034188034188e-06, + "loss": 5.8173, + "step": 262 + }, + { + "epoch": 2.247863247863248, + "grad_norm": 15.91961669921875, + "learning_rate": 8.876068376068377e-06, + "loss": 5.7171, + "step": 263 + }, + { + "epoch": 2.2564102564102564, + "grad_norm": 23.30504035949707, + "learning_rate": 8.871794871794872e-06, + "loss": 5.6214, + "step": 264 + }, + { + "epoch": 2.264957264957265, + "grad_norm": 15.583686828613281, + "learning_rate": 8.867521367521369e-06, + "loss": 5.2343, + "step": 265 + }, + { + "epoch": 2.2735042735042734, + "grad_norm": 24.482046127319336, + "learning_rate": 8.863247863247864e-06, + "loss": 5.0747, + "step": 266 + }, + { + "epoch": 2.282051282051282, + "grad_norm": 16.17924690246582, + "learning_rate": 8.858974358974359e-06, + "loss": 5.2645, + "step": 267 + }, + { + "epoch": 2.2905982905982905, + "grad_norm": 19.538314819335938, + "learning_rate": 8.854700854700855e-06, + "loss": 5.3484, + "step": 268 + }, + { + "epoch": 2.299145299145299, + "grad_norm": 14.472186088562012, + "learning_rate": 8.85042735042735e-06, + "loss": 5.8159, + "step": 269 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 16.797805786132812, + "learning_rate": 8.846153846153847e-06, + "loss": 5.4466, + "step": 270 + }, + { + "epoch": 2.316239316239316, + "grad_norm": 13.237580299377441, + "learning_rate": 8.841880341880342e-06, + "loss": 5.2189, + "step": 271 + }, + { + "epoch": 2.324786324786325, + "grad_norm": 16.685317993164062, + "learning_rate": 8.837606837606837e-06, + "loss": 5.7098, + "step": 272 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 16.63880729675293, + "learning_rate": 8.833333333333334e-06, + "loss": 5.0714, + "step": 273 + }, + { + "epoch": 2.341880341880342, + "grad_norm": 20.871978759765625, + "learning_rate": 8.829059829059829e-06, + "loss": 4.9509, + "step": 274 + }, + { + "epoch": 2.3504273504273505, + "grad_norm": 16.95268440246582, + "learning_rate": 8.824786324786326e-06, + "loss": 5.4166, + "step": 275 + }, + { + "epoch": 2.358974358974359, + "grad_norm": 15.446279525756836, + "learning_rate": 8.820512820512821e-06, + "loss": 4.5967, + "step": 276 + }, + { + "epoch": 2.3675213675213675, + "grad_norm": 17.148235321044922, + "learning_rate": 8.816239316239316e-06, + "loss": 5.2542, + "step": 277 + }, + { + "epoch": 2.376068376068376, + "grad_norm": 17.014827728271484, + "learning_rate": 8.811965811965813e-06, + "loss": 5.4702, + "step": 278 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 15.313383102416992, + "learning_rate": 8.807692307692308e-06, + "loss": 5.2119, + "step": 279 + }, + { + "epoch": 2.393162393162393, + "grad_norm": 20.2298641204834, + "learning_rate": 8.803418803418804e-06, + "loss": 5.4064, + "step": 280 + }, + { + "epoch": 2.4017094017094016, + "grad_norm": 14.982254981994629, + "learning_rate": 8.7991452991453e-06, + "loss": 5.2545, + "step": 281 + }, + { + "epoch": 2.41025641025641, + "grad_norm": 16.258047103881836, + "learning_rate": 8.794871794871796e-06, + "loss": 5.0141, + "step": 282 + }, + { + "epoch": 2.4188034188034186, + "grad_norm": 22.5199031829834, + "learning_rate": 8.790598290598291e-06, + "loss": 5.3486, + "step": 283 + }, + { + "epoch": 2.427350427350427, + "grad_norm": 17.546480178833008, + "learning_rate": 8.786324786324786e-06, + "loss": 5.2785, + "step": 284 + }, + { + "epoch": 2.435897435897436, + "grad_norm": 22.07866668701172, + "learning_rate": 8.782051282051283e-06, + "loss": 5.4471, + "step": 285 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 409.2532043457031, + "learning_rate": 8.777777777777778e-06, + "loss": 6.0948, + "step": 286 + }, + { + "epoch": 2.452991452991453, + "grad_norm": 185.7334747314453, + "learning_rate": 8.773504273504275e-06, + "loss": 5.5538, + "step": 287 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 30.8182430267334, + "learning_rate": 8.76923076923077e-06, + "loss": 4.9661, + "step": 288 + }, + { + "epoch": 2.47008547008547, + "grad_norm": 18.584409713745117, + "learning_rate": 8.764957264957265e-06, + "loss": 5.0947, + "step": 289 + }, + { + "epoch": 2.4786324786324787, + "grad_norm": 18.128522872924805, + "learning_rate": 8.760683760683762e-06, + "loss": 4.8816, + "step": 290 + }, + { + "epoch": 2.4871794871794872, + "grad_norm": 18.800090789794922, + "learning_rate": 8.756410256410257e-06, + "loss": 5.0952, + "step": 291 + }, + { + "epoch": 2.4957264957264957, + "grad_norm": 22.140430450439453, + "learning_rate": 8.752136752136753e-06, + "loss": 4.5408, + "step": 292 + }, + { + "epoch": 2.5042735042735043, + "grad_norm": 19.867111206054688, + "learning_rate": 8.747863247863248e-06, + "loss": 4.7435, + "step": 293 + }, + { + "epoch": 2.5128205128205128, + "grad_norm": 19.437868118286133, + "learning_rate": 8.743589743589743e-06, + "loss": 5.2643, + "step": 294 + }, + { + "epoch": 2.5213675213675213, + "grad_norm": 18.256561279296875, + "learning_rate": 8.73931623931624e-06, + "loss": 5.2531, + "step": 295 + }, + { + "epoch": 2.52991452991453, + "grad_norm": 18.65209197998047, + "learning_rate": 8.735042735042735e-06, + "loss": 4.8646, + "step": 296 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 14.704927444458008, + "learning_rate": 8.730769230769232e-06, + "loss": 4.8343, + "step": 297 + }, + { + "epoch": 2.547008547008547, + "grad_norm": 15.522851943969727, + "learning_rate": 8.726495726495727e-06, + "loss": 4.898, + "step": 298 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 21.7825927734375, + "learning_rate": 8.722222222222224e-06, + "loss": 5.0732, + "step": 299 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 17.963552474975586, + "learning_rate": 8.717948717948719e-06, + "loss": 4.9684, + "step": 300 + }, + { + "epoch": 2.5726495726495724, + "grad_norm": 16.14459991455078, + "learning_rate": 8.713675213675214e-06, + "loss": 4.8802, + "step": 301 + }, + { + "epoch": 2.5811965811965814, + "grad_norm": 18.386646270751953, + "learning_rate": 8.70940170940171e-06, + "loss": 4.8837, + "step": 302 + }, + { + "epoch": 2.58974358974359, + "grad_norm": 19.471376419067383, + "learning_rate": 8.705128205128206e-06, + "loss": 4.6325, + "step": 303 + }, + { + "epoch": 2.5982905982905984, + "grad_norm": 17.839717864990234, + "learning_rate": 8.700854700854702e-06, + "loss": 4.7851, + "step": 304 + }, + { + "epoch": 2.606837606837607, + "grad_norm": 26.519363403320312, + "learning_rate": 8.696581196581197e-06, + "loss": 5.0576, + "step": 305 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 14.135244369506836, + "learning_rate": 8.692307692307692e-06, + "loss": 4.7719, + "step": 306 + }, + { + "epoch": 2.623931623931624, + "grad_norm": 16.5241641998291, + "learning_rate": 8.68803418803419e-06, + "loss": 4.5826, + "step": 307 + }, + { + "epoch": 2.6324786324786325, + "grad_norm": 23.982437133789062, + "learning_rate": 8.683760683760684e-06, + "loss": 4.4878, + "step": 308 + }, + { + "epoch": 2.641025641025641, + "grad_norm": 16.036361694335938, + "learning_rate": 8.679487179487181e-06, + "loss": 4.3867, + "step": 309 + }, + { + "epoch": 2.6495726495726495, + "grad_norm": 16.19298553466797, + "learning_rate": 8.675213675213676e-06, + "loss": 4.763, + "step": 310 + }, + { + "epoch": 2.658119658119658, + "grad_norm": 19.32802963256836, + "learning_rate": 8.670940170940171e-06, + "loss": 4.4083, + "step": 311 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 21.75898551940918, + "learning_rate": 8.666666666666668e-06, + "loss": 4.8782, + "step": 312 + }, + { + "epoch": 2.6752136752136755, + "grad_norm": 905.6954956054688, + "learning_rate": 8.662393162393163e-06, + "loss": 5.7901, + "step": 313 + }, + { + "epoch": 2.683760683760684, + "grad_norm": 21.126985549926758, + "learning_rate": 8.65811965811966e-06, + "loss": 4.918, + "step": 314 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 22.190237045288086, + "learning_rate": 8.653846153846155e-06, + "loss": 4.4327, + "step": 315 + }, + { + "epoch": 2.700854700854701, + "grad_norm": 90.69184875488281, + "learning_rate": 8.649572649572651e-06, + "loss": 5.1477, + "step": 316 + }, + { + "epoch": 2.7094017094017095, + "grad_norm": 43.43864059448242, + "learning_rate": 8.645299145299146e-06, + "loss": 4.5476, + "step": 317 + }, + { + "epoch": 2.717948717948718, + "grad_norm": 19.24538230895996, + "learning_rate": 8.641025641025641e-06, + "loss": 4.4304, + "step": 318 + }, + { + "epoch": 2.7264957264957266, + "grad_norm": 21.809600830078125, + "learning_rate": 8.636752136752138e-06, + "loss": 4.4215, + "step": 319 + }, + { + "epoch": 2.735042735042735, + "grad_norm": 21.406156539916992, + "learning_rate": 8.632478632478633e-06, + "loss": 4.5411, + "step": 320 + }, + { + "epoch": 2.7435897435897436, + "grad_norm": 17.57236099243164, + "learning_rate": 8.62820512820513e-06, + "loss": 4.7952, + "step": 321 + }, + { + "epoch": 2.752136752136752, + "grad_norm": 21.049169540405273, + "learning_rate": 8.623931623931625e-06, + "loss": 4.4596, + "step": 322 + }, + { + "epoch": 2.7606837606837606, + "grad_norm": 20.04981803894043, + "learning_rate": 8.61965811965812e-06, + "loss": 4.4705, + "step": 323 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 21.146499633789062, + "learning_rate": 8.615384615384617e-06, + "loss": 4.6081, + "step": 324 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 20.9805908203125, + "learning_rate": 8.611111111111112e-06, + "loss": 4.8387, + "step": 325 + }, + { + "epoch": 2.786324786324786, + "grad_norm": 17.708343505859375, + "learning_rate": 8.606837606837609e-06, + "loss": 4.3455, + "step": 326 + }, + { + "epoch": 2.7948717948717947, + "grad_norm": 25.657032012939453, + "learning_rate": 8.602564102564104e-06, + "loss": 4.3119, + "step": 327 + }, + { + "epoch": 2.8034188034188032, + "grad_norm": 17.713972091674805, + "learning_rate": 8.598290598290599e-06, + "loss": 4.5597, + "step": 328 + }, + { + "epoch": 2.8119658119658117, + "grad_norm": 22.297082901000977, + "learning_rate": 8.594017094017095e-06, + "loss": 3.8398, + "step": 329 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 16.11454200744629, + "learning_rate": 8.58974358974359e-06, + "loss": 3.2049, + "step": 330 + }, + { + "epoch": 2.8290598290598292, + "grad_norm": 27.323585510253906, + "learning_rate": 8.585470085470086e-06, + "loss": 4.0371, + "step": 331 + }, + { + "epoch": 2.8376068376068377, + "grad_norm": 21.090797424316406, + "learning_rate": 8.58119658119658e-06, + "loss": 4.5193, + "step": 332 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 39.087432861328125, + "learning_rate": 8.576923076923077e-06, + "loss": 4.3537, + "step": 333 + }, + { + "epoch": 2.8547008547008548, + "grad_norm": 18.49846839904785, + "learning_rate": 8.572649572649572e-06, + "loss": 4.614, + "step": 334 + }, + { + "epoch": 2.8632478632478633, + "grad_norm": 26.671632766723633, + "learning_rate": 8.568376068376069e-06, + "loss": 4.2224, + "step": 335 + }, + { + "epoch": 2.871794871794872, + "grad_norm": 25.799545288085938, + "learning_rate": 8.564102564102564e-06, + "loss": 4.2209, + "step": 336 + }, + { + "epoch": 2.8803418803418803, + "grad_norm": 20.131961822509766, + "learning_rate": 8.559829059829061e-06, + "loss": 4.5194, + "step": 337 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 20.193859100341797, + "learning_rate": 8.555555555555556e-06, + "loss": 3.9966, + "step": 338 + }, + { + "epoch": 2.8974358974358974, + "grad_norm": 20.06737518310547, + "learning_rate": 8.551282051282051e-06, + "loss": 3.7394, + "step": 339 + }, + { + "epoch": 2.905982905982906, + "grad_norm": 438.34429931640625, + "learning_rate": 8.547008547008548e-06, + "loss": 5.1558, + "step": 340 + }, + { + "epoch": 2.9145299145299144, + "grad_norm": 22.152528762817383, + "learning_rate": 8.542735042735043e-06, + "loss": 3.9014, + "step": 341 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 29.279739379882812, + "learning_rate": 8.53846153846154e-06, + "loss": 4.0479, + "step": 342 + }, + { + "epoch": 2.931623931623932, + "grad_norm": 26.182645797729492, + "learning_rate": 8.534188034188035e-06, + "loss": 4.2022, + "step": 343 + }, + { + "epoch": 2.9401709401709404, + "grad_norm": 22.329736709594727, + "learning_rate": 8.52991452991453e-06, + "loss": 3.8777, + "step": 344 + }, + { + "epoch": 2.948717948717949, + "grad_norm": 20.62833023071289, + "learning_rate": 8.525641025641026e-06, + "loss": 4.2189, + "step": 345 + }, + { + "epoch": 2.9572649572649574, + "grad_norm": 20.176612854003906, + "learning_rate": 8.521367521367521e-06, + "loss": 4.0124, + "step": 346 + }, + { + "epoch": 2.965811965811966, + "grad_norm": 18.77017593383789, + "learning_rate": 8.517094017094018e-06, + "loss": 3.3286, + "step": 347 + }, + { + "epoch": 2.9743589743589745, + "grad_norm": 226.93701171875, + "learning_rate": 8.512820512820513e-06, + "loss": 4.6969, + "step": 348 + }, + { + "epoch": 2.982905982905983, + "grad_norm": 675.1133422851562, + "learning_rate": 8.508547008547008e-06, + "loss": 4.6717, + "step": 349 + }, + { + "epoch": 2.9914529914529915, + "grad_norm": 19.938486099243164, + "learning_rate": 8.504273504273505e-06, + "loss": 4.0103, + "step": 350 + }, + { + "epoch": 3.0, + "grad_norm": 15.917003631591797, + "learning_rate": 8.5e-06, + "loss": 3.1643, + "step": 351 + }, + { + "epoch": 3.0, + "eval_loss": 3.4197537899017334, + "eval_runtime": 9.289, + "eval_samples_per_second": 50.167, + "eval_steps_per_second": 6.352, + "step": 351 + }, + { + "epoch": 3.0085470085470085, + "grad_norm": 22.22833251953125, + "learning_rate": 8.495726495726497e-06, + "loss": 4.3458, + "step": 352 + }, + { + "epoch": 3.017094017094017, + "grad_norm": 16.4627685546875, + "learning_rate": 8.491452991452992e-06, + "loss": 3.5374, + "step": 353 + }, + { + "epoch": 3.0256410256410255, + "grad_norm": 16.389379501342773, + "learning_rate": 8.487179487179488e-06, + "loss": 4.1384, + "step": 354 + }, + { + "epoch": 3.034188034188034, + "grad_norm": 19.589706420898438, + "learning_rate": 8.482905982905983e-06, + "loss": 3.9522, + "step": 355 + }, + { + "epoch": 3.0427350427350426, + "grad_norm": 21.66250228881836, + "learning_rate": 8.478632478632479e-06, + "loss": 4.0197, + "step": 356 + }, + { + "epoch": 3.051282051282051, + "grad_norm": 42.1422119140625, + "learning_rate": 8.474358974358975e-06, + "loss": 3.9432, + "step": 357 + }, + { + "epoch": 3.0598290598290596, + "grad_norm": 23.0153751373291, + "learning_rate": 8.47008547008547e-06, + "loss": 3.9146, + "step": 358 + }, + { + "epoch": 3.0683760683760686, + "grad_norm": 20.847400665283203, + "learning_rate": 8.465811965811967e-06, + "loss": 3.9736, + "step": 359 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 23.553855895996094, + "learning_rate": 8.461538461538462e-06, + "loss": 3.646, + "step": 360 + }, + { + "epoch": 3.0854700854700856, + "grad_norm": 18.651151657104492, + "learning_rate": 8.457264957264957e-06, + "loss": 3.761, + "step": 361 + }, + { + "epoch": 3.094017094017094, + "grad_norm": 23.437379837036133, + "learning_rate": 8.452991452991454e-06, + "loss": 3.9258, + "step": 362 + }, + { + "epoch": 3.1025641025641026, + "grad_norm": 19.025928497314453, + "learning_rate": 8.448717948717949e-06, + "loss": 3.4911, + "step": 363 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 25.955963134765625, + "learning_rate": 8.444444444444446e-06, + "loss": 3.7231, + "step": 364 + }, + { + "epoch": 3.1196581196581197, + "grad_norm": 19.691673278808594, + "learning_rate": 8.44017094017094e-06, + "loss": 3.9225, + "step": 365 + }, + { + "epoch": 3.128205128205128, + "grad_norm": 19.47168731689453, + "learning_rate": 8.435897435897436e-06, + "loss": 3.6261, + "step": 366 + }, + { + "epoch": 3.1367521367521367, + "grad_norm": 20.50010108947754, + "learning_rate": 8.431623931623932e-06, + "loss": 3.3306, + "step": 367 + }, + { + "epoch": 3.1452991452991452, + "grad_norm": 21.198938369750977, + "learning_rate": 8.427350427350428e-06, + "loss": 3.6388, + "step": 368 + }, + { + "epoch": 3.1538461538461537, + "grad_norm": 16.93203353881836, + "learning_rate": 8.423076923076924e-06, + "loss": 3.9556, + "step": 369 + }, + { + "epoch": 3.1623931623931623, + "grad_norm": 15.074128150939941, + "learning_rate": 8.41880341880342e-06, + "loss": 2.9899, + "step": 370 + }, + { + "epoch": 3.1709401709401708, + "grad_norm": 23.041452407836914, + "learning_rate": 8.414529914529916e-06, + "loss": 3.291, + "step": 371 + }, + { + "epoch": 3.1794871794871793, + "grad_norm": 24.146419525146484, + "learning_rate": 8.410256410256411e-06, + "loss": 4.0683, + "step": 372 + }, + { + "epoch": 3.1880341880341883, + "grad_norm": 27.864879608154297, + "learning_rate": 8.405982905982906e-06, + "loss": 3.6171, + "step": 373 + }, + { + "epoch": 3.1965811965811968, + "grad_norm": 33.83136749267578, + "learning_rate": 8.401709401709403e-06, + "loss": 3.7324, + "step": 374 + }, + { + "epoch": 3.2051282051282053, + "grad_norm": 21.020702362060547, + "learning_rate": 8.397435897435898e-06, + "loss": 3.5688, + "step": 375 + }, + { + "epoch": 3.213675213675214, + "grad_norm": 23.521453857421875, + "learning_rate": 8.393162393162395e-06, + "loss": 3.6917, + "step": 376 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 35.85578536987305, + "learning_rate": 8.38888888888889e-06, + "loss": 3.6532, + "step": 377 + }, + { + "epoch": 3.230769230769231, + "grad_norm": 26.080968856811523, + "learning_rate": 8.384615384615385e-06, + "loss": 3.8828, + "step": 378 + }, + { + "epoch": 3.2393162393162394, + "grad_norm": 20.829381942749023, + "learning_rate": 8.380341880341881e-06, + "loss": 3.8374, + "step": 379 + }, + { + "epoch": 3.247863247863248, + "grad_norm": 20.85077476501465, + "learning_rate": 8.376068376068377e-06, + "loss": 3.2896, + "step": 380 + }, + { + "epoch": 3.2564102564102564, + "grad_norm": 19.036088943481445, + "learning_rate": 8.371794871794873e-06, + "loss": 3.4996, + "step": 381 + }, + { + "epoch": 3.264957264957265, + "grad_norm": 23.725513458251953, + "learning_rate": 8.367521367521368e-06, + "loss": 3.7686, + "step": 382 + }, + { + "epoch": 3.2735042735042734, + "grad_norm": 22.553386688232422, + "learning_rate": 8.363247863247865e-06, + "loss": 3.8476, + "step": 383 + }, + { + "epoch": 3.282051282051282, + "grad_norm": 20.263992309570312, + "learning_rate": 8.35897435897436e-06, + "loss": 3.3278, + "step": 384 + }, + { + "epoch": 3.2905982905982905, + "grad_norm": 22.47858238220215, + "learning_rate": 8.354700854700855e-06, + "loss": 3.5437, + "step": 385 + }, + { + "epoch": 3.299145299145299, + "grad_norm": 24.14532470703125, + "learning_rate": 8.350427350427352e-06, + "loss": 3.696, + "step": 386 + }, + { + "epoch": 3.3076923076923075, + "grad_norm": 31.457847595214844, + "learning_rate": 8.346153846153847e-06, + "loss": 4.3065, + "step": 387 + }, + { + "epoch": 3.316239316239316, + "grad_norm": 24.503095626831055, + "learning_rate": 8.341880341880344e-06, + "loss": 3.4798, + "step": 388 + }, + { + "epoch": 3.324786324786325, + "grad_norm": 19.798818588256836, + "learning_rate": 8.337606837606839e-06, + "loss": 3.5323, + "step": 389 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 22.023189544677734, + "learning_rate": 8.333333333333334e-06, + "loss": 3.4088, + "step": 390 + }, + { + "epoch": 3.341880341880342, + "grad_norm": 17.314960479736328, + "learning_rate": 8.32905982905983e-06, + "loss": 3.2462, + "step": 391 + }, + { + "epoch": 3.3504273504273505, + "grad_norm": 22.714536666870117, + "learning_rate": 8.324786324786326e-06, + "loss": 3.7863, + "step": 392 + }, + { + "epoch": 3.358974358974359, + "grad_norm": 27.710514068603516, + "learning_rate": 8.320512820512822e-06, + "loss": 3.6032, + "step": 393 + }, + { + "epoch": 3.3675213675213675, + "grad_norm": 23.35419464111328, + "learning_rate": 8.316239316239317e-06, + "loss": 3.5599, + "step": 394 + }, + { + "epoch": 3.376068376068376, + "grad_norm": 24.0956974029541, + "learning_rate": 8.311965811965812e-06, + "loss": 3.5186, + "step": 395 + }, + { + "epoch": 3.3846153846153846, + "grad_norm": 22.09107780456543, + "learning_rate": 8.307692307692309e-06, + "loss": 3.4843, + "step": 396 + }, + { + "epoch": 3.393162393162393, + "grad_norm": 23.956623077392578, + "learning_rate": 8.303418803418804e-06, + "loss": 3.1625, + "step": 397 + }, + { + "epoch": 3.4017094017094016, + "grad_norm": 18.875917434692383, + "learning_rate": 8.299145299145301e-06, + "loss": 3.3494, + "step": 398 + }, + { + "epoch": 3.41025641025641, + "grad_norm": 33.475467681884766, + "learning_rate": 8.294871794871796e-06, + "loss": 3.9247, + "step": 399 + }, + { + "epoch": 3.4188034188034186, + "grad_norm": 16.28295135498047, + "learning_rate": 8.290598290598293e-06, + "loss": 3.7446, + "step": 400 + }, + { + "epoch": 3.427350427350427, + "grad_norm": 24.205049514770508, + "learning_rate": 8.286324786324788e-06, + "loss": 3.343, + "step": 401 + }, + { + "epoch": 3.435897435897436, + "grad_norm": 21.21460723876953, + "learning_rate": 8.282051282051283e-06, + "loss": 3.2437, + "step": 402 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 36.8713264465332, + "learning_rate": 8.277777777777778e-06, + "loss": 3.5009, + "step": 403 + }, + { + "epoch": 3.452991452991453, + "grad_norm": 26.85513687133789, + "learning_rate": 8.273504273504273e-06, + "loss": 3.7271, + "step": 404 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 18.184600830078125, + "learning_rate": 8.26923076923077e-06, + "loss": 3.2216, + "step": 405 + }, + { + "epoch": 3.47008547008547, + "grad_norm": 27.03692054748535, + "learning_rate": 8.264957264957265e-06, + "loss": 3.516, + "step": 406 + }, + { + "epoch": 3.4786324786324787, + "grad_norm": 20.63736915588379, + "learning_rate": 8.260683760683761e-06, + "loss": 3.1349, + "step": 407 + }, + { + "epoch": 3.4871794871794872, + "grad_norm": 22.467845916748047, + "learning_rate": 8.256410256410256e-06, + "loss": 3.3878, + "step": 408 + }, + { + "epoch": 3.4957264957264957, + "grad_norm": 21.25887107849121, + "learning_rate": 8.252136752136753e-06, + "loss": 3.8298, + "step": 409 + }, + { + "epoch": 3.5042735042735043, + "grad_norm": 47.3256721496582, + "learning_rate": 8.247863247863248e-06, + "loss": 3.5321, + "step": 410 + }, + { + "epoch": 3.5128205128205128, + "grad_norm": 22.103790283203125, + "learning_rate": 8.243589743589743e-06, + "loss": 3.335, + "step": 411 + }, + { + "epoch": 3.5213675213675213, + "grad_norm": 25.779077529907227, + "learning_rate": 8.23931623931624e-06, + "loss": 3.5047, + "step": 412 + }, + { + "epoch": 3.52991452991453, + "grad_norm": 22.78207778930664, + "learning_rate": 8.235042735042735e-06, + "loss": 3.3827, + "step": 413 + }, + { + "epoch": 3.5384615384615383, + "grad_norm": 22.41836166381836, + "learning_rate": 8.230769230769232e-06, + "loss": 3.4521, + "step": 414 + }, + { + "epoch": 3.547008547008547, + "grad_norm": 60.29216384887695, + "learning_rate": 8.226495726495727e-06, + "loss": 3.4598, + "step": 415 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 25.27474021911621, + "learning_rate": 8.222222222222222e-06, + "loss": 3.7443, + "step": 416 + }, + { + "epoch": 3.564102564102564, + "grad_norm": 25.297466278076172, + "learning_rate": 8.217948717948719e-06, + "loss": 3.3123, + "step": 417 + }, + { + "epoch": 3.5726495726495724, + "grad_norm": 28.5858154296875, + "learning_rate": 8.213675213675214e-06, + "loss": 3.1801, + "step": 418 + }, + { + "epoch": 3.5811965811965814, + "grad_norm": 20.05567741394043, + "learning_rate": 8.20940170940171e-06, + "loss": 3.7242, + "step": 419 + }, + { + "epoch": 3.58974358974359, + "grad_norm": 32.33693313598633, + "learning_rate": 8.205128205128205e-06, + "loss": 3.3587, + "step": 420 + }, + { + "epoch": 3.5982905982905984, + "grad_norm": 36.1716194152832, + "learning_rate": 8.200854700854702e-06, + "loss": 3.1573, + "step": 421 + }, + { + "epoch": 3.606837606837607, + "grad_norm": 33.39027404785156, + "learning_rate": 8.196581196581197e-06, + "loss": 3.098, + "step": 422 + }, + { + "epoch": 3.6153846153846154, + "grad_norm": 28.4794864654541, + "learning_rate": 8.192307692307692e-06, + "loss": 3.6403, + "step": 423 + }, + { + "epoch": 3.623931623931624, + "grad_norm": 29.702611923217773, + "learning_rate": 8.188034188034189e-06, + "loss": 3.2569, + "step": 424 + }, + { + "epoch": 3.6324786324786325, + "grad_norm": 24.73663902282715, + "learning_rate": 8.183760683760684e-06, + "loss": 3.0508, + "step": 425 + }, + { + "epoch": 3.641025641025641, + "grad_norm": 29.606807708740234, + "learning_rate": 8.17948717948718e-06, + "loss": 3.2524, + "step": 426 + }, + { + "epoch": 3.6495726495726495, + "grad_norm": 22.721933364868164, + "learning_rate": 8.175213675213676e-06, + "loss": 3.2583, + "step": 427 + }, + { + "epoch": 3.658119658119658, + "grad_norm": 25.009403228759766, + "learning_rate": 8.17094017094017e-06, + "loss": 3.0678, + "step": 428 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 25.776636123657227, + "learning_rate": 8.166666666666668e-06, + "loss": 3.1676, + "step": 429 + }, + { + "epoch": 3.6752136752136755, + "grad_norm": 28.210241317749023, + "learning_rate": 8.162393162393163e-06, + "loss": 3.2869, + "step": 430 + }, + { + "epoch": 3.683760683760684, + "grad_norm": 26.29328155517578, + "learning_rate": 8.15811965811966e-06, + "loss": 3.3618, + "step": 431 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 19.813465118408203, + "learning_rate": 8.153846153846154e-06, + "loss": 3.0655, + "step": 432 + }, + { + "epoch": 3.700854700854701, + "grad_norm": 29.718812942504883, + "learning_rate": 8.14957264957265e-06, + "loss": 3.1538, + "step": 433 + }, + { + "epoch": 3.7094017094017095, + "grad_norm": 30.629135131835938, + "learning_rate": 8.145299145299146e-06, + "loss": 3.3252, + "step": 434 + }, + { + "epoch": 3.717948717948718, + "grad_norm": 27.716825485229492, + "learning_rate": 8.141025641025641e-06, + "loss": 3.4083, + "step": 435 + }, + { + "epoch": 3.7264957264957266, + "grad_norm": 39.23820877075195, + "learning_rate": 8.136752136752138e-06, + "loss": 3.3074, + "step": 436 + }, + { + "epoch": 3.735042735042735, + "grad_norm": 34.516422271728516, + "learning_rate": 8.132478632478633e-06, + "loss": 3.3529, + "step": 437 + }, + { + "epoch": 3.7435897435897436, + "grad_norm": 41.98606872558594, + "learning_rate": 8.12820512820513e-06, + "loss": 3.248, + "step": 438 + }, + { + "epoch": 3.752136752136752, + "grad_norm": 27.99711799621582, + "learning_rate": 8.123931623931625e-06, + "loss": 3.3054, + "step": 439 + }, + { + "epoch": 3.7606837606837606, + "grad_norm": 25.21969985961914, + "learning_rate": 8.11965811965812e-06, + "loss": 2.8518, + "step": 440 + }, + { + "epoch": 3.769230769230769, + "grad_norm": 29.14298439025879, + "learning_rate": 8.115384615384617e-06, + "loss": 3.0063, + "step": 441 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 27.040063858032227, + "learning_rate": 8.111111111111112e-06, + "loss": 3.3066, + "step": 442 + }, + { + "epoch": 3.786324786324786, + "grad_norm": 365.3290100097656, + "learning_rate": 8.106837606837608e-06, + "loss": 3.8057, + "step": 443 + }, + { + "epoch": 3.7948717948717947, + "grad_norm": 32.89745330810547, + "learning_rate": 8.102564102564103e-06, + "loss": 3.0903, + "step": 444 + }, + { + "epoch": 3.8034188034188032, + "grad_norm": 29.448022842407227, + "learning_rate": 8.098290598290598e-06, + "loss": 3.2723, + "step": 445 + }, + { + "epoch": 3.8119658119658117, + "grad_norm": 27.838123321533203, + "learning_rate": 8.094017094017095e-06, + "loss": 3.2903, + "step": 446 + }, + { + "epoch": 3.8205128205128203, + "grad_norm": 29.047847747802734, + "learning_rate": 8.08974358974359e-06, + "loss": 2.9048, + "step": 447 + }, + { + "epoch": 3.8290598290598292, + "grad_norm": 28.666589736938477, + "learning_rate": 8.085470085470087e-06, + "loss": 3.2186, + "step": 448 + }, + { + "epoch": 3.8376068376068377, + "grad_norm": 31.796804428100586, + "learning_rate": 8.081196581196582e-06, + "loss": 3.2668, + "step": 449 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 22.665220260620117, + "learning_rate": 8.076923076923077e-06, + "loss": 3.0965, + "step": 450 + }, + { + "epoch": 3.8547008547008548, + "grad_norm": 32.7353630065918, + "learning_rate": 8.072649572649574e-06, + "loss": 3.1759, + "step": 451 + }, + { + "epoch": 3.8632478632478633, + "grad_norm": 32.95683670043945, + "learning_rate": 8.068376068376069e-06, + "loss": 2.9589, + "step": 452 + }, + { + "epoch": 3.871794871794872, + "grad_norm": 30.04659652709961, + "learning_rate": 8.064102564102566e-06, + "loss": 3.4709, + "step": 453 + }, + { + "epoch": 3.8803418803418803, + "grad_norm": 30.41158676147461, + "learning_rate": 8.05982905982906e-06, + "loss": 2.9385, + "step": 454 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 30.059635162353516, + "learning_rate": 8.055555555555557e-06, + "loss": 3.0099, + "step": 455 + }, + { + "epoch": 3.8974358974358974, + "grad_norm": 24.83198356628418, + "learning_rate": 8.051282051282052e-06, + "loss": 2.9783, + "step": 456 + }, + { + "epoch": 3.905982905982906, + "grad_norm": 25.38758087158203, + "learning_rate": 8.047008547008547e-06, + "loss": 3.0275, + "step": 457 + }, + { + "epoch": 3.9145299145299144, + "grad_norm": 25.21868133544922, + "learning_rate": 8.042735042735044e-06, + "loss": 2.9096, + "step": 458 + }, + { + "epoch": 3.9230769230769234, + "grad_norm": 32.02922058105469, + "learning_rate": 8.03846153846154e-06, + "loss": 3.059, + "step": 459 + }, + { + "epoch": 3.931623931623932, + "grad_norm": 22.240680694580078, + "learning_rate": 8.034188034188036e-06, + "loss": 2.9473, + "step": 460 + }, + { + "epoch": 3.9401709401709404, + "grad_norm": 27.61838150024414, + "learning_rate": 8.029914529914531e-06, + "loss": 2.4506, + "step": 461 + }, + { + "epoch": 3.948717948717949, + "grad_norm": 27.742216110229492, + "learning_rate": 8.025641025641026e-06, + "loss": 2.9082, + "step": 462 + }, + { + "epoch": 3.9572649572649574, + "grad_norm": 29.965059280395508, + "learning_rate": 8.021367521367523e-06, + "loss": 2.8268, + "step": 463 + }, + { + "epoch": 3.965811965811966, + "grad_norm": 31.429990768432617, + "learning_rate": 8.017094017094018e-06, + "loss": 3.1805, + "step": 464 + }, + { + "epoch": 3.9743589743589745, + "grad_norm": 31.162532806396484, + "learning_rate": 8.012820512820515e-06, + "loss": 2.64, + "step": 465 + }, + { + "epoch": 3.982905982905983, + "grad_norm": 28.240577697753906, + "learning_rate": 8.00854700854701e-06, + "loss": 3.249, + "step": 466 + }, + { + "epoch": 3.9914529914529915, + "grad_norm": 48.52914810180664, + "learning_rate": 8.004273504273505e-06, + "loss": 3.1619, + "step": 467 + }, + { + "epoch": 4.0, + "grad_norm": 36.80685806274414, + "learning_rate": 8.000000000000001e-06, + "loss": 3.5337, + "step": 468 + }, + { + "epoch": 4.0, + "eval_loss": 2.1340389251708984, + "eval_runtime": 9.2211, + "eval_samples_per_second": 50.536, + "eval_steps_per_second": 6.398, + "step": 468 + }, + { + "epoch": 4.0085470085470085, + "grad_norm": 45.45211410522461, + "learning_rate": 7.995726495726496e-06, + "loss": 3.5596, + "step": 469 + }, + { + "epoch": 4.017094017094017, + "grad_norm": 32.711669921875, + "learning_rate": 7.991452991452993e-06, + "loss": 2.9362, + "step": 470 + }, + { + "epoch": 4.0256410256410255, + "grad_norm": 26.151872634887695, + "learning_rate": 7.987179487179488e-06, + "loss": 2.6796, + "step": 471 + }, + { + "epoch": 4.034188034188034, + "grad_norm": 33.02329635620117, + "learning_rate": 7.982905982905985e-06, + "loss": 2.7147, + "step": 472 + }, + { + "epoch": 4.042735042735043, + "grad_norm": 31.1684513092041, + "learning_rate": 7.97863247863248e-06, + "loss": 3.2356, + "step": 473 + }, + { + "epoch": 4.051282051282051, + "grad_norm": 37.0435905456543, + "learning_rate": 7.974358974358975e-06, + "loss": 2.9954, + "step": 474 + }, + { + "epoch": 4.05982905982906, + "grad_norm": 25.989973068237305, + "learning_rate": 7.970085470085472e-06, + "loss": 3.2143, + "step": 475 + }, + { + "epoch": 4.068376068376068, + "grad_norm": 27.048690795898438, + "learning_rate": 7.965811965811967e-06, + "loss": 2.5087, + "step": 476 + }, + { + "epoch": 4.076923076923077, + "grad_norm": 26.857696533203125, + "learning_rate": 7.961538461538462e-06, + "loss": 2.6466, + "step": 477 + }, + { + "epoch": 4.085470085470085, + "grad_norm": 33.342193603515625, + "learning_rate": 7.957264957264957e-06, + "loss": 2.6591, + "step": 478 + }, + { + "epoch": 4.094017094017094, + "grad_norm": 64.21253967285156, + "learning_rate": 7.952991452991454e-06, + "loss": 3.0295, + "step": 479 + }, + { + "epoch": 4.102564102564102, + "grad_norm": 31.240161895751953, + "learning_rate": 7.948717948717949e-06, + "loss": 2.9374, + "step": 480 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 29.338851928710938, + "learning_rate": 7.944444444444445e-06, + "loss": 2.5019, + "step": 481 + }, + { + "epoch": 4.119658119658119, + "grad_norm": 36.79518127441406, + "learning_rate": 7.94017094017094e-06, + "loss": 2.7649, + "step": 482 + }, + { + "epoch": 4.128205128205128, + "grad_norm": 37.036739349365234, + "learning_rate": 7.935897435897435e-06, + "loss": 2.5182, + "step": 483 + }, + { + "epoch": 4.136752136752137, + "grad_norm": 42.571163177490234, + "learning_rate": 7.931623931623932e-06, + "loss": 2.767, + "step": 484 + }, + { + "epoch": 4.145299145299146, + "grad_norm": 33.72893524169922, + "learning_rate": 7.927350427350427e-06, + "loss": 3.1404, + "step": 485 + }, + { + "epoch": 4.153846153846154, + "grad_norm": 27.06032943725586, + "learning_rate": 7.923076923076924e-06, + "loss": 2.6825, + "step": 486 + }, + { + "epoch": 4.162393162393163, + "grad_norm": 31.8147029876709, + "learning_rate": 7.918803418803419e-06, + "loss": 2.5129, + "step": 487 + }, + { + "epoch": 4.170940170940171, + "grad_norm": 35.681793212890625, + "learning_rate": 7.914529914529914e-06, + "loss": 2.4793, + "step": 488 + }, + { + "epoch": 4.17948717948718, + "grad_norm": 159.4467315673828, + "learning_rate": 7.91025641025641e-06, + "loss": 3.5531, + "step": 489 + }, + { + "epoch": 4.188034188034188, + "grad_norm": 40.12252426147461, + "learning_rate": 7.905982905982906e-06, + "loss": 2.7095, + "step": 490 + }, + { + "epoch": 4.196581196581197, + "grad_norm": 27.05786895751953, + "learning_rate": 7.901709401709403e-06, + "loss": 2.5984, + "step": 491 + }, + { + "epoch": 4.205128205128205, + "grad_norm": 24.31035614013672, + "learning_rate": 7.897435897435898e-06, + "loss": 2.89, + "step": 492 + }, + { + "epoch": 4.213675213675214, + "grad_norm": 277.16156005859375, + "learning_rate": 7.893162393162394e-06, + "loss": 3.8076, + "step": 493 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 29.722867965698242, + "learning_rate": 7.88888888888889e-06, + "loss": 2.4189, + "step": 494 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 40.47605514526367, + "learning_rate": 7.884615384615384e-06, + "loss": 2.6225, + "step": 495 + }, + { + "epoch": 4.239316239316239, + "grad_norm": 29.136499404907227, + "learning_rate": 7.880341880341881e-06, + "loss": 2.5223, + "step": 496 + }, + { + "epoch": 4.247863247863248, + "grad_norm": 78.86258697509766, + "learning_rate": 7.876068376068376e-06, + "loss": 2.6587, + "step": 497 + }, + { + "epoch": 4.256410256410256, + "grad_norm": 24.473243713378906, + "learning_rate": 7.871794871794873e-06, + "loss": 2.456, + "step": 498 + }, + { + "epoch": 4.264957264957265, + "grad_norm": 80.45248413085938, + "learning_rate": 7.867521367521368e-06, + "loss": 3.1893, + "step": 499 + }, + { + "epoch": 4.273504273504273, + "grad_norm": 194.2708282470703, + "learning_rate": 7.863247863247863e-06, + "loss": 3.8294, + "step": 500 + }, + { + "epoch": 4.282051282051282, + "grad_norm": 27.74302101135254, + "learning_rate": 7.85897435897436e-06, + "loss": 2.2506, + "step": 501 + }, + { + "epoch": 4.2905982905982905, + "grad_norm": 21.90385627746582, + "learning_rate": 7.854700854700855e-06, + "loss": 3.0985, + "step": 502 + }, + { + "epoch": 4.299145299145299, + "grad_norm": 50.30342102050781, + "learning_rate": 7.850427350427352e-06, + "loss": 2.526, + "step": 503 + }, + { + "epoch": 4.3076923076923075, + "grad_norm": 28.666881561279297, + "learning_rate": 7.846153846153847e-06, + "loss": 2.4213, + "step": 504 + }, + { + "epoch": 4.316239316239316, + "grad_norm": 27.927257537841797, + "learning_rate": 7.841880341880342e-06, + "loss": 2.6731, + "step": 505 + }, + { + "epoch": 4.3247863247863245, + "grad_norm": 36.12032699584961, + "learning_rate": 7.837606837606838e-06, + "loss": 2.3323, + "step": 506 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 31.632287979125977, + "learning_rate": 7.833333333333333e-06, + "loss": 2.2966, + "step": 507 + }, + { + "epoch": 4.3418803418803416, + "grad_norm": 26.511537551879883, + "learning_rate": 7.82905982905983e-06, + "loss": 2.3422, + "step": 508 + }, + { + "epoch": 4.35042735042735, + "grad_norm": 31.429107666015625, + "learning_rate": 7.824786324786325e-06, + "loss": 2.6764, + "step": 509 + }, + { + "epoch": 4.358974358974359, + "grad_norm": 29.8817138671875, + "learning_rate": 7.820512820512822e-06, + "loss": 2.4358, + "step": 510 + }, + { + "epoch": 4.367521367521368, + "grad_norm": 29.293964385986328, + "learning_rate": 7.816239316239317e-06, + "loss": 2.504, + "step": 511 + }, + { + "epoch": 4.3760683760683765, + "grad_norm": 23.624290466308594, + "learning_rate": 7.811965811965812e-06, + "loss": 2.0312, + "step": 512 + }, + { + "epoch": 4.384615384615385, + "grad_norm": 25.336505889892578, + "learning_rate": 7.807692307692309e-06, + "loss": 2.1045, + "step": 513 + }, + { + "epoch": 4.3931623931623935, + "grad_norm": 24.755443572998047, + "learning_rate": 7.803418803418804e-06, + "loss": 2.5754, + "step": 514 + }, + { + "epoch": 4.401709401709402, + "grad_norm": 29.29696273803711, + "learning_rate": 7.7991452991453e-06, + "loss": 2.562, + "step": 515 + }, + { + "epoch": 4.410256410256411, + "grad_norm": 28.054868698120117, + "learning_rate": 7.794871794871796e-06, + "loss": 1.9815, + "step": 516 + }, + { + "epoch": 4.418803418803419, + "grad_norm": 20.894853591918945, + "learning_rate": 7.79059829059829e-06, + "loss": 2.5668, + "step": 517 + }, + { + "epoch": 4.427350427350428, + "grad_norm": 19.532094955444336, + "learning_rate": 7.786324786324787e-06, + "loss": 2.2314, + "step": 518 + }, + { + "epoch": 4.435897435897436, + "grad_norm": 27.919715881347656, + "learning_rate": 7.782051282051282e-06, + "loss": 1.9523, + "step": 519 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 21.91543960571289, + "learning_rate": 7.77777777777778e-06, + "loss": 2.559, + "step": 520 + }, + { + "epoch": 4.452991452991453, + "grad_norm": 26.20106315612793, + "learning_rate": 7.773504273504274e-06, + "loss": 2.367, + "step": 521 + }, + { + "epoch": 4.461538461538462, + "grad_norm": 23.455419540405273, + "learning_rate": 7.76923076923077e-06, + "loss": 2.4132, + "step": 522 + }, + { + "epoch": 4.47008547008547, + "grad_norm": 49.62391662597656, + "learning_rate": 7.764957264957266e-06, + "loss": 1.8896, + "step": 523 + }, + { + "epoch": 4.478632478632479, + "grad_norm": 25.721101760864258, + "learning_rate": 7.760683760683761e-06, + "loss": 1.9918, + "step": 524 + }, + { + "epoch": 4.487179487179487, + "grad_norm": 22.906694412231445, + "learning_rate": 7.756410256410258e-06, + "loss": 2.1819, + "step": 525 + }, + { + "epoch": 4.495726495726496, + "grad_norm": 28.5809268951416, + "learning_rate": 7.752136752136753e-06, + "loss": 2.0516, + "step": 526 + }, + { + "epoch": 4.504273504273504, + "grad_norm": 26.47665023803711, + "learning_rate": 7.74786324786325e-06, + "loss": 2.0081, + "step": 527 + }, + { + "epoch": 4.512820512820513, + "grad_norm": 27.221372604370117, + "learning_rate": 7.743589743589745e-06, + "loss": 2.0414, + "step": 528 + }, + { + "epoch": 4.521367521367521, + "grad_norm": 27.931568145751953, + "learning_rate": 7.73931623931624e-06, + "loss": 2.0335, + "step": 529 + }, + { + "epoch": 4.52991452991453, + "grad_norm": 25.567049026489258, + "learning_rate": 7.735042735042736e-06, + "loss": 2.0129, + "step": 530 + }, + { + "epoch": 4.538461538461538, + "grad_norm": 30.897083282470703, + "learning_rate": 7.730769230769231e-06, + "loss": 2.3941, + "step": 531 + }, + { + "epoch": 4.547008547008547, + "grad_norm": 21.92133903503418, + "learning_rate": 7.726495726495728e-06, + "loss": 2.2563, + "step": 532 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 27.053892135620117, + "learning_rate": 7.722222222222223e-06, + "loss": 2.2463, + "step": 533 + }, + { + "epoch": 4.564102564102564, + "grad_norm": 29.3230037689209, + "learning_rate": 7.717948717948718e-06, + "loss": 1.9167, + "step": 534 + }, + { + "epoch": 4.572649572649572, + "grad_norm": 36.06028747558594, + "learning_rate": 7.713675213675215e-06, + "loss": 1.9106, + "step": 535 + }, + { + "epoch": 4.581196581196581, + "grad_norm": 24.622135162353516, + "learning_rate": 7.70940170940171e-06, + "loss": 2.2899, + "step": 536 + }, + { + "epoch": 4.589743589743589, + "grad_norm": 21.3137264251709, + "learning_rate": 7.705128205128207e-06, + "loss": 2.0166, + "step": 537 + }, + { + "epoch": 4.598290598290598, + "grad_norm": 21.939279556274414, + "learning_rate": 7.700854700854702e-06, + "loss": 2.3319, + "step": 538 + }, + { + "epoch": 4.6068376068376065, + "grad_norm": 25.496994018554688, + "learning_rate": 7.696581196581197e-06, + "loss": 2.6162, + "step": 539 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 24.095666885375977, + "learning_rate": 7.692307692307694e-06, + "loss": 2.2863, + "step": 540 + }, + { + "epoch": 4.6239316239316235, + "grad_norm": 31.96511459350586, + "learning_rate": 7.688034188034189e-06, + "loss": 2.0261, + "step": 541 + }, + { + "epoch": 4.632478632478632, + "grad_norm": 22.66115379333496, + "learning_rate": 7.683760683760685e-06, + "loss": 2.2786, + "step": 542 + }, + { + "epoch": 4.641025641025641, + "grad_norm": 23.661611557006836, + "learning_rate": 7.67948717948718e-06, + "loss": 1.7113, + "step": 543 + }, + { + "epoch": 4.64957264957265, + "grad_norm": 18.64708709716797, + "learning_rate": 7.675213675213677e-06, + "loss": 2.1389, + "step": 544 + }, + { + "epoch": 4.6581196581196584, + "grad_norm": 20.55480194091797, + "learning_rate": 7.670940170940172e-06, + "loss": 2.0831, + "step": 545 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 27.876964569091797, + "learning_rate": 7.666666666666667e-06, + "loss": 2.0358, + "step": 546 + }, + { + "epoch": 4.6752136752136755, + "grad_norm": 20.236507415771484, + "learning_rate": 7.662393162393164e-06, + "loss": 1.5596, + "step": 547 + }, + { + "epoch": 4.683760683760684, + "grad_norm": 23.360782623291016, + "learning_rate": 7.658119658119659e-06, + "loss": 1.9623, + "step": 548 + }, + { + "epoch": 4.6923076923076925, + "grad_norm": 41.7568359375, + "learning_rate": 7.653846153846154e-06, + "loss": 1.9884, + "step": 549 + }, + { + "epoch": 4.700854700854701, + "grad_norm": 28.651065826416016, + "learning_rate": 7.649572649572649e-06, + "loss": 2.1491, + "step": 550 + }, + { + "epoch": 4.7094017094017095, + "grad_norm": 23.636432647705078, + "learning_rate": 7.645299145299146e-06, + "loss": 1.9352, + "step": 551 + }, + { + "epoch": 4.717948717948718, + "grad_norm": 25.313966751098633, + "learning_rate": 7.641025641025641e-06, + "loss": 2.4112, + "step": 552 + }, + { + "epoch": 4.726495726495727, + "grad_norm": 32.4974479675293, + "learning_rate": 7.636752136752138e-06, + "loss": 1.7017, + "step": 553 + }, + { + "epoch": 4.735042735042735, + "grad_norm": 20.644481658935547, + "learning_rate": 7.632478632478633e-06, + "loss": 1.6904, + "step": 554 + }, + { + "epoch": 4.743589743589744, + "grad_norm": 26.526721954345703, + "learning_rate": 7.6282051282051286e-06, + "loss": 2.1666, + "step": 555 + }, + { + "epoch": 4.752136752136752, + "grad_norm": 23.375839233398438, + "learning_rate": 7.6239316239316244e-06, + "loss": 1.5555, + "step": 556 + }, + { + "epoch": 4.760683760683761, + "grad_norm": 29.890501022338867, + "learning_rate": 7.6196581196581195e-06, + "loss": 2.0195, + "step": 557 + }, + { + "epoch": 4.769230769230769, + "grad_norm": 687.5745239257812, + "learning_rate": 7.615384615384615e-06, + "loss": 2.4286, + "step": 558 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 22.844587326049805, + "learning_rate": 7.611111111111111e-06, + "loss": 2.2335, + "step": 559 + }, + { + "epoch": 4.786324786324786, + "grad_norm": 29.633562088012695, + "learning_rate": 7.606837606837607e-06, + "loss": 1.7579, + "step": 560 + }, + { + "epoch": 4.794871794871795, + "grad_norm": 48.04582977294922, + "learning_rate": 7.602564102564103e-06, + "loss": 2.3846, + "step": 561 + }, + { + "epoch": 4.803418803418803, + "grad_norm": 27.2290096282959, + "learning_rate": 7.598290598290599e-06, + "loss": 2.2234, + "step": 562 + }, + { + "epoch": 4.811965811965812, + "grad_norm": 29.782209396362305, + "learning_rate": 7.594017094017094e-06, + "loss": 2.0365, + "step": 563 + }, + { + "epoch": 4.82051282051282, + "grad_norm": 32.457061767578125, + "learning_rate": 7.58974358974359e-06, + "loss": 2.0451, + "step": 564 + }, + { + "epoch": 4.829059829059829, + "grad_norm": 22.089427947998047, + "learning_rate": 7.585470085470086e-06, + "loss": 1.7105, + "step": 565 + }, + { + "epoch": 4.837606837606837, + "grad_norm": 23.105140686035156, + "learning_rate": 7.581196581196582e-06, + "loss": 1.6817, + "step": 566 + }, + { + "epoch": 4.846153846153846, + "grad_norm": 24.513713836669922, + "learning_rate": 7.5769230769230775e-06, + "loss": 1.9553, + "step": 567 + }, + { + "epoch": 4.854700854700854, + "grad_norm": 22.187759399414062, + "learning_rate": 7.572649572649573e-06, + "loss": 2.0309, + "step": 568 + }, + { + "epoch": 4.863247863247864, + "grad_norm": 53.56728744506836, + "learning_rate": 7.5683760683760685e-06, + "loss": 2.6508, + "step": 569 + }, + { + "epoch": 4.871794871794872, + "grad_norm": 27.983978271484375, + "learning_rate": 7.564102564102564e-06, + "loss": 2.1942, + "step": 570 + }, + { + "epoch": 4.880341880341881, + "grad_norm": 25.610252380371094, + "learning_rate": 7.55982905982906e-06, + "loss": 1.4151, + "step": 571 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 19.856618881225586, + "learning_rate": 7.555555555555556e-06, + "loss": 1.6968, + "step": 572 + }, + { + "epoch": 4.897435897435898, + "grad_norm": 20.288606643676758, + "learning_rate": 7.551282051282052e-06, + "loss": 1.7494, + "step": 573 + }, + { + "epoch": 4.905982905982906, + "grad_norm": 23.206768035888672, + "learning_rate": 7.547008547008547e-06, + "loss": 2.1255, + "step": 574 + }, + { + "epoch": 4.914529914529915, + "grad_norm": 21.275257110595703, + "learning_rate": 7.542735042735043e-06, + "loss": 1.7442, + "step": 575 + }, + { + "epoch": 4.923076923076923, + "grad_norm": 22.635417938232422, + "learning_rate": 7.538461538461539e-06, + "loss": 1.9129, + "step": 576 + }, + { + "epoch": 4.931623931623932, + "grad_norm": 21.440109252929688, + "learning_rate": 7.534188034188035e-06, + "loss": 2.0056, + "step": 577 + }, + { + "epoch": 4.94017094017094, + "grad_norm": 20.939407348632812, + "learning_rate": 7.529914529914531e-06, + "loss": 1.7231, + "step": 578 + }, + { + "epoch": 4.948717948717949, + "grad_norm": 16.189861297607422, + "learning_rate": 7.5256410256410265e-06, + "loss": 1.4255, + "step": 579 + }, + { + "epoch": 4.957264957264957, + "grad_norm": 23.6302547454834, + "learning_rate": 7.521367521367522e-06, + "loss": 1.6748, + "step": 580 + }, + { + "epoch": 4.965811965811966, + "grad_norm": 22.29713249206543, + "learning_rate": 7.5170940170940175e-06, + "loss": 1.5285, + "step": 581 + }, + { + "epoch": 4.9743589743589745, + "grad_norm": 22.831275939941406, + "learning_rate": 7.512820512820513e-06, + "loss": 1.7742, + "step": 582 + }, + { + "epoch": 4.982905982905983, + "grad_norm": 630.5899658203125, + "learning_rate": 7.508547008547009e-06, + "loss": 2.8598, + "step": 583 + }, + { + "epoch": 4.9914529914529915, + "grad_norm": 22.880647659301758, + "learning_rate": 7.504273504273505e-06, + "loss": 1.6231, + "step": 584 + }, + { + "epoch": 5.0, + "grad_norm": 21.379072189331055, + "learning_rate": 7.500000000000001e-06, + "loss": 1.3506, + "step": 585 + }, + { + "epoch": 5.0, + "eval_loss": 0.8325614333152771, + "eval_runtime": 9.2303, + "eval_samples_per_second": 50.486, + "eval_steps_per_second": 6.392, + "step": 585 + }, + { + "epoch": 5.0085470085470085, + "grad_norm": 23.968698501586914, + "learning_rate": 7.495726495726496e-06, + "loss": 1.4263, + "step": 586 + }, + { + "epoch": 5.017094017094017, + "grad_norm": 24.880769729614258, + "learning_rate": 7.491452991452992e-06, + "loss": 1.4994, + "step": 587 + }, + { + "epoch": 5.0256410256410255, + "grad_norm": 23.4547176361084, + "learning_rate": 7.487179487179488e-06, + "loss": 1.671, + "step": 588 + }, + { + "epoch": 5.034188034188034, + "grad_norm": 17.382152557373047, + "learning_rate": 7.482905982905984e-06, + "loss": 1.3935, + "step": 589 + }, + { + "epoch": 5.042735042735043, + "grad_norm": 19.607717514038086, + "learning_rate": 7.47863247863248e-06, + "loss": 1.5652, + "step": 590 + }, + { + "epoch": 5.051282051282051, + "grad_norm": 27.735240936279297, + "learning_rate": 7.474358974358975e-06, + "loss": 1.5491, + "step": 591 + }, + { + "epoch": 5.05982905982906, + "grad_norm": 20.493412017822266, + "learning_rate": 7.4700854700854706e-06, + "loss": 1.9229, + "step": 592 + }, + { + "epoch": 5.068376068376068, + "grad_norm": 20.492137908935547, + "learning_rate": 7.4658119658119665e-06, + "loss": 1.5066, + "step": 593 + }, + { + "epoch": 5.076923076923077, + "grad_norm": 27.650495529174805, + "learning_rate": 7.461538461538462e-06, + "loss": 1.4228, + "step": 594 + }, + { + "epoch": 5.085470085470085, + "grad_norm": 22.38190269470215, + "learning_rate": 7.457264957264958e-06, + "loss": 1.6243, + "step": 595 + }, + { + "epoch": 5.094017094017094, + "grad_norm": 22.862489700317383, + "learning_rate": 7.452991452991454e-06, + "loss": 1.9224, + "step": 596 + }, + { + "epoch": 5.102564102564102, + "grad_norm": 17.368051528930664, + "learning_rate": 7.448717948717949e-06, + "loss": 1.3642, + "step": 597 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 20.587018966674805, + "learning_rate": 7.444444444444445e-06, + "loss": 1.471, + "step": 598 + }, + { + "epoch": 5.119658119658119, + "grad_norm": 18.502887725830078, + "learning_rate": 7.440170940170941e-06, + "loss": 1.9841, + "step": 599 + }, + { + "epoch": 5.128205128205128, + "grad_norm": 21.305294036865234, + "learning_rate": 7.435897435897437e-06, + "loss": 1.8564, + "step": 600 + }, + { + "epoch": 5.136752136752137, + "grad_norm": 20.61264419555664, + "learning_rate": 7.431623931623933e-06, + "loss": 1.3554, + "step": 601 + }, + { + "epoch": 5.145299145299146, + "grad_norm": 19.05555534362793, + "learning_rate": 7.427350427350429e-06, + "loss": 1.6612, + "step": 602 + }, + { + "epoch": 5.153846153846154, + "grad_norm": 20.392446517944336, + "learning_rate": 7.423076923076924e-06, + "loss": 1.5071, + "step": 603 + }, + { + "epoch": 5.162393162393163, + "grad_norm": 22.007591247558594, + "learning_rate": 7.4188034188034196e-06, + "loss": 1.3356, + "step": 604 + }, + { + "epoch": 5.170940170940171, + "grad_norm": 18.928104400634766, + "learning_rate": 7.4145299145299155e-06, + "loss": 1.6214, + "step": 605 + }, + { + "epoch": 5.17948717948718, + "grad_norm": 21.151193618774414, + "learning_rate": 7.410256410256411e-06, + "loss": 1.5275, + "step": 606 + }, + { + "epoch": 5.188034188034188, + "grad_norm": 16.272262573242188, + "learning_rate": 7.405982905982907e-06, + "loss": 1.2773, + "step": 607 + }, + { + "epoch": 5.196581196581197, + "grad_norm": 21.59275245666504, + "learning_rate": 7.401709401709402e-06, + "loss": 1.3503, + "step": 608 + }, + { + "epoch": 5.205128205128205, + "grad_norm": 84.31806182861328, + "learning_rate": 7.397435897435898e-06, + "loss": 1.8618, + "step": 609 + }, + { + "epoch": 5.213675213675214, + "grad_norm": 20.374465942382812, + "learning_rate": 7.393162393162394e-06, + "loss": 1.6153, + "step": 610 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 18.569623947143555, + "learning_rate": 7.38888888888889e-06, + "loss": 1.7101, + "step": 611 + }, + { + "epoch": 5.230769230769231, + "grad_norm": 19.51409339904785, + "learning_rate": 7.384615384615386e-06, + "loss": 1.5801, + "step": 612 + }, + { + "epoch": 5.239316239316239, + "grad_norm": 19.45322608947754, + "learning_rate": 7.380341880341882e-06, + "loss": 1.1376, + "step": 613 + }, + { + "epoch": 5.247863247863248, + "grad_norm": 23.474557876586914, + "learning_rate": 7.376068376068377e-06, + "loss": 1.442, + "step": 614 + }, + { + "epoch": 5.256410256410256, + "grad_norm": 21.458847045898438, + "learning_rate": 7.371794871794873e-06, + "loss": 1.2769, + "step": 615 + }, + { + "epoch": 5.264957264957265, + "grad_norm": 25.741121292114258, + "learning_rate": 7.3675213675213686e-06, + "loss": 1.3321, + "step": 616 + }, + { + "epoch": 5.273504273504273, + "grad_norm": 15.394718170166016, + "learning_rate": 7.3632478632478645e-06, + "loss": 1.2335, + "step": 617 + }, + { + "epoch": 5.282051282051282, + "grad_norm": 20.938871383666992, + "learning_rate": 7.35897435897436e-06, + "loss": 1.5741, + "step": 618 + }, + { + "epoch": 5.2905982905982905, + "grad_norm": 19.348268508911133, + "learning_rate": 7.354700854700856e-06, + "loss": 1.2493, + "step": 619 + }, + { + "epoch": 5.299145299145299, + "grad_norm": 25.26751708984375, + "learning_rate": 7.350427350427351e-06, + "loss": 1.5167, + "step": 620 + }, + { + "epoch": 5.3076923076923075, + "grad_norm": 22.099227905273438, + "learning_rate": 7.346153846153847e-06, + "loss": 1.3269, + "step": 621 + }, + { + "epoch": 5.316239316239316, + "grad_norm": 21.483428955078125, + "learning_rate": 7.341880341880342e-06, + "loss": 1.4249, + "step": 622 + }, + { + "epoch": 5.3247863247863245, + "grad_norm": 20.089691162109375, + "learning_rate": 7.337606837606837e-06, + "loss": 1.351, + "step": 623 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 138.9898223876953, + "learning_rate": 7.333333333333333e-06, + "loss": 1.5682, + "step": 624 + }, + { + "epoch": 5.3418803418803416, + "grad_norm": 16.808000564575195, + "learning_rate": 7.329059829059829e-06, + "loss": 1.4794, + "step": 625 + }, + { + "epoch": 5.35042735042735, + "grad_norm": 18.58464813232422, + "learning_rate": 7.324786324786325e-06, + "loss": 1.4486, + "step": 626 + }, + { + "epoch": 5.358974358974359, + "grad_norm": 15.074477195739746, + "learning_rate": 7.320512820512821e-06, + "loss": 1.3124, + "step": 627 + }, + { + "epoch": 5.367521367521368, + "grad_norm": 15.800148963928223, + "learning_rate": 7.316239316239317e-06, + "loss": 1.7055, + "step": 628 + }, + { + "epoch": 5.3760683760683765, + "grad_norm": 19.166179656982422, + "learning_rate": 7.311965811965812e-06, + "loss": 1.7306, + "step": 629 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 55.91648864746094, + "learning_rate": 7.307692307692308e-06, + "loss": 1.2376, + "step": 630 + }, + { + "epoch": 5.3931623931623935, + "grad_norm": 16.606033325195312, + "learning_rate": 7.3034188034188035e-06, + "loss": 1.1159, + "step": 631 + }, + { + "epoch": 5.401709401709402, + "grad_norm": 17.0134220123291, + "learning_rate": 7.299145299145299e-06, + "loss": 1.2124, + "step": 632 + }, + { + "epoch": 5.410256410256411, + "grad_norm": 17.511932373046875, + "learning_rate": 7.294871794871795e-06, + "loss": 1.4221, + "step": 633 + }, + { + "epoch": 5.418803418803419, + "grad_norm": 44.53416061401367, + "learning_rate": 7.290598290598291e-06, + "loss": 1.9583, + "step": 634 + }, + { + "epoch": 5.427350427350428, + "grad_norm": 16.546630859375, + "learning_rate": 7.286324786324786e-06, + "loss": 1.1722, + "step": 635 + }, + { + "epoch": 5.435897435897436, + "grad_norm": 39.90822982788086, + "learning_rate": 7.282051282051282e-06, + "loss": 1.7482, + "step": 636 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 16.186573028564453, + "learning_rate": 7.277777777777778e-06, + "loss": 1.3422, + "step": 637 + }, + { + "epoch": 5.452991452991453, + "grad_norm": 18.84516143798828, + "learning_rate": 7.273504273504274e-06, + "loss": 1.3299, + "step": 638 + }, + { + "epoch": 5.461538461538462, + "grad_norm": 14.620058059692383, + "learning_rate": 7.26923076923077e-06, + "loss": 1.0604, + "step": 639 + }, + { + "epoch": 5.47008547008547, + "grad_norm": 16.5911865234375, + "learning_rate": 7.264957264957266e-06, + "loss": 1.1138, + "step": 640 + }, + { + "epoch": 5.478632478632479, + "grad_norm": 15.44485092163086, + "learning_rate": 7.260683760683761e-06, + "loss": 1.435, + "step": 641 + }, + { + "epoch": 5.487179487179487, + "grad_norm": 121.76724243164062, + "learning_rate": 7.256410256410257e-06, + "loss": 1.7167, + "step": 642 + }, + { + "epoch": 5.495726495726496, + "grad_norm": 1996.141357421875, + "learning_rate": 7.2521367521367525e-06, + "loss": 4.0296, + "step": 643 + }, + { + "epoch": 5.504273504273504, + "grad_norm": 15.072067260742188, + "learning_rate": 7.247863247863248e-06, + "loss": 1.0455, + "step": 644 + }, + { + "epoch": 5.512820512820513, + "grad_norm": 16.684345245361328, + "learning_rate": 7.243589743589744e-06, + "loss": 1.7565, + "step": 645 + }, + { + "epoch": 5.521367521367521, + "grad_norm": 15.515148162841797, + "learning_rate": 7.239316239316239e-06, + "loss": 1.4601, + "step": 646 + }, + { + "epoch": 5.52991452991453, + "grad_norm": 20.1015625, + "learning_rate": 7.235042735042735e-06, + "loss": 1.073, + "step": 647 + }, + { + "epoch": 5.538461538461538, + "grad_norm": 67.10873413085938, + "learning_rate": 7.230769230769231e-06, + "loss": 1.8586, + "step": 648 + }, + { + "epoch": 5.547008547008547, + "grad_norm": 13.775193214416504, + "learning_rate": 7.226495726495727e-06, + "loss": 1.2891, + "step": 649 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 14.612048149108887, + "learning_rate": 7.222222222222223e-06, + "loss": 1.033, + "step": 650 + }, + { + "epoch": 5.564102564102564, + "grad_norm": 14.512042999267578, + "learning_rate": 7.217948717948719e-06, + "loss": 1.1446, + "step": 651 + }, + { + "epoch": 5.572649572649572, + "grad_norm": 13.720820426940918, + "learning_rate": 7.213675213675214e-06, + "loss": 1.1246, + "step": 652 + }, + { + "epoch": 5.581196581196581, + "grad_norm": 16.548046112060547, + "learning_rate": 7.20940170940171e-06, + "loss": 1.3162, + "step": 653 + }, + { + "epoch": 5.589743589743589, + "grad_norm": 20.535181045532227, + "learning_rate": 7.205128205128206e-06, + "loss": 1.3019, + "step": 654 + }, + { + "epoch": 5.598290598290598, + "grad_norm": 14.317465782165527, + "learning_rate": 7.2008547008547015e-06, + "loss": 1.5447, + "step": 655 + }, + { + "epoch": 5.6068376068376065, + "grad_norm": 16.23088836669922, + "learning_rate": 7.196581196581197e-06, + "loss": 1.2701, + "step": 656 + }, + { + "epoch": 5.615384615384615, + "grad_norm": 13.754173278808594, + "learning_rate": 7.192307692307693e-06, + "loss": 1.2218, + "step": 657 + }, + { + "epoch": 5.6239316239316235, + "grad_norm": 75.77688598632812, + "learning_rate": 7.188034188034188e-06, + "loss": 1.7547, + "step": 658 + }, + { + "epoch": 5.632478632478632, + "grad_norm": 19.452077865600586, + "learning_rate": 7.183760683760684e-06, + "loss": 1.1446, + "step": 659 + }, + { + "epoch": 5.641025641025641, + "grad_norm": 14.513677597045898, + "learning_rate": 7.17948717948718e-06, + "loss": 1.0527, + "step": 660 + }, + { + "epoch": 5.64957264957265, + "grad_norm": 27.67446517944336, + "learning_rate": 7.175213675213676e-06, + "loss": 1.1953, + "step": 661 + }, + { + "epoch": 5.6581196581196584, + "grad_norm": 12.137639999389648, + "learning_rate": 7.170940170940172e-06, + "loss": 1.1127, + "step": 662 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 17.2878475189209, + "learning_rate": 7.166666666666667e-06, + "loss": 1.0475, + "step": 663 + }, + { + "epoch": 5.6752136752136755, + "grad_norm": 28.070842742919922, + "learning_rate": 7.162393162393163e-06, + "loss": 1.6271, + "step": 664 + }, + { + "epoch": 5.683760683760684, + "grad_norm": 17.74942398071289, + "learning_rate": 7.158119658119659e-06, + "loss": 1.1759, + "step": 665 + }, + { + "epoch": 5.6923076923076925, + "grad_norm": 19.545486450195312, + "learning_rate": 7.153846153846155e-06, + "loss": 0.9753, + "step": 666 + }, + { + "epoch": 5.700854700854701, + "grad_norm": 24.34153938293457, + "learning_rate": 7.1495726495726505e-06, + "loss": 1.0905, + "step": 667 + }, + { + "epoch": 5.7094017094017095, + "grad_norm": 211.7845001220703, + "learning_rate": 7.145299145299146e-06, + "loss": 1.6455, + "step": 668 + }, + { + "epoch": 5.717948717948718, + "grad_norm": 14.03074836730957, + "learning_rate": 7.1410256410256414e-06, + "loss": 1.3728, + "step": 669 + }, + { + "epoch": 5.726495726495727, + "grad_norm": 27.600345611572266, + "learning_rate": 7.136752136752137e-06, + "loss": 1.4212, + "step": 670 + }, + { + "epoch": 5.735042735042735, + "grad_norm": 15.755846977233887, + "learning_rate": 7.132478632478633e-06, + "loss": 1.148, + "step": 671 + }, + { + "epoch": 5.743589743589744, + "grad_norm": 12.816133499145508, + "learning_rate": 7.128205128205129e-06, + "loss": 1.0053, + "step": 672 + }, + { + "epoch": 5.752136752136752, + "grad_norm": 25.097660064697266, + "learning_rate": 7.123931623931625e-06, + "loss": 1.1561, + "step": 673 + }, + { + "epoch": 5.760683760683761, + "grad_norm": 19.249279022216797, + "learning_rate": 7.119658119658121e-06, + "loss": 1.2582, + "step": 674 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 18.606924057006836, + "learning_rate": 7.115384615384616e-06, + "loss": 0.8569, + "step": 675 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 20.2148380279541, + "learning_rate": 7.111111111111112e-06, + "loss": 1.1126, + "step": 676 + }, + { + "epoch": 5.786324786324786, + "grad_norm": 18.623268127441406, + "learning_rate": 7.106837606837608e-06, + "loss": 1.6129, + "step": 677 + }, + { + "epoch": 5.794871794871795, + "grad_norm": 14.888258934020996, + "learning_rate": 7.102564102564104e-06, + "loss": 1.2533, + "step": 678 + }, + { + "epoch": 5.803418803418803, + "grad_norm": 15.351551055908203, + "learning_rate": 7.0982905982905995e-06, + "loss": 1.2392, + "step": 679 + }, + { + "epoch": 5.811965811965812, + "grad_norm": 23.243993759155273, + "learning_rate": 7.0940170940170945e-06, + "loss": 1.3136, + "step": 680 + }, + { + "epoch": 5.82051282051282, + "grad_norm": 18.346277236938477, + "learning_rate": 7.0897435897435904e-06, + "loss": 1.5691, + "step": 681 + }, + { + "epoch": 5.829059829059829, + "grad_norm": 12.904829025268555, + "learning_rate": 7.085470085470086e-06, + "loss": 0.9248, + "step": 682 + }, + { + "epoch": 5.837606837606837, + "grad_norm": 13.263056755065918, + "learning_rate": 7.081196581196582e-06, + "loss": 1.0555, + "step": 683 + }, + { + "epoch": 5.846153846153846, + "grad_norm": 19.311899185180664, + "learning_rate": 7.076923076923078e-06, + "loss": 1.4341, + "step": 684 + }, + { + "epoch": 5.854700854700854, + "grad_norm": 282.1452331542969, + "learning_rate": 7.072649572649574e-06, + "loss": 1.9797, + "step": 685 + }, + { + "epoch": 5.863247863247864, + "grad_norm": 14.317438125610352, + "learning_rate": 7.068376068376069e-06, + "loss": 0.839, + "step": 686 + }, + { + "epoch": 5.871794871794872, + "grad_norm": 13.549150466918945, + "learning_rate": 7.064102564102565e-06, + "loss": 1.1003, + "step": 687 + }, + { + "epoch": 5.880341880341881, + "grad_norm": 14.283610343933105, + "learning_rate": 7.059829059829061e-06, + "loss": 1.0297, + "step": 688 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 18.737884521484375, + "learning_rate": 7.055555555555557e-06, + "loss": 0.9817, + "step": 689 + }, + { + "epoch": 5.897435897435898, + "grad_norm": 24.12625503540039, + "learning_rate": 7.051282051282053e-06, + "loss": 1.1837, + "step": 690 + }, + { + "epoch": 5.905982905982906, + "grad_norm": 11.760732650756836, + "learning_rate": 7.0470085470085485e-06, + "loss": 1.5131, + "step": 691 + }, + { + "epoch": 5.914529914529915, + "grad_norm": 16.138668060302734, + "learning_rate": 7.0427350427350435e-06, + "loss": 0.9569, + "step": 692 + }, + { + "epoch": 5.923076923076923, + "grad_norm": 17.727285385131836, + "learning_rate": 7.038461538461539e-06, + "loss": 0.9834, + "step": 693 + }, + { + "epoch": 5.931623931623932, + "grad_norm": 13.434252738952637, + "learning_rate": 7.034188034188035e-06, + "loss": 1.3635, + "step": 694 + }, + { + "epoch": 5.94017094017094, + "grad_norm": 15.587186813354492, + "learning_rate": 7.02991452991453e-06, + "loss": 1.4814, + "step": 695 + }, + { + "epoch": 5.948717948717949, + "grad_norm": 31.379039764404297, + "learning_rate": 7.025641025641025e-06, + "loss": 0.8792, + "step": 696 + }, + { + "epoch": 5.957264957264957, + "grad_norm": 14.575559616088867, + "learning_rate": 7.021367521367521e-06, + "loss": 0.8865, + "step": 697 + }, + { + "epoch": 5.965811965811966, + "grad_norm": 13.55718994140625, + "learning_rate": 7.017094017094017e-06, + "loss": 0.9564, + "step": 698 + }, + { + "epoch": 5.9743589743589745, + "grad_norm": 13.288110733032227, + "learning_rate": 7.012820512820513e-06, + "loss": 0.8117, + "step": 699 + }, + { + "epoch": 5.982905982905983, + "grad_norm": 14.522254943847656, + "learning_rate": 7.008547008547009e-06, + "loss": 1.2037, + "step": 700 + }, + { + "epoch": 5.9914529914529915, + "grad_norm": 14.575456619262695, + "learning_rate": 7.004273504273504e-06, + "loss": 1.028, + "step": 701 + }, + { + "epoch": 6.0, + "grad_norm": 13.18249225616455, + "learning_rate": 7e-06, + "loss": 0.6528, + "step": 702 + }, + { + "epoch": 6.0, + "eval_loss": 0.4769609868526459, + "eval_runtime": 9.253, + "eval_samples_per_second": 50.362, + "eval_steps_per_second": 6.376, + "step": 702 + }, + { + "epoch": 6.0085470085470085, + "grad_norm": 17.034433364868164, + "learning_rate": 6.995726495726496e-06, + "loss": 0.847, + "step": 703 + }, + { + "epoch": 6.017094017094017, + "grad_norm": 13.455194473266602, + "learning_rate": 6.991452991452992e-06, + "loss": 0.8545, + "step": 704 + }, + { + "epoch": 6.0256410256410255, + "grad_norm": 14.511704444885254, + "learning_rate": 6.9871794871794876e-06, + "loss": 0.9365, + "step": 705 + }, + { + "epoch": 6.034188034188034, + "grad_norm": 14.325255393981934, + "learning_rate": 6.9829059829059835e-06, + "loss": 0.869, + "step": 706 + }, + { + "epoch": 6.042735042735043, + "grad_norm": 12.944524765014648, + "learning_rate": 6.9786324786324785e-06, + "loss": 1.1417, + "step": 707 + }, + { + "epoch": 6.051282051282051, + "grad_norm": 14.992669105529785, + "learning_rate": 6.974358974358974e-06, + "loss": 1.4935, + "step": 708 + }, + { + "epoch": 6.05982905982906, + "grad_norm": 15.394392013549805, + "learning_rate": 6.97008547008547e-06, + "loss": 1.519, + "step": 709 + }, + { + "epoch": 6.068376068376068, + "grad_norm": 12.605085372924805, + "learning_rate": 6.965811965811966e-06, + "loss": 1.4419, + "step": 710 + }, + { + "epoch": 6.076923076923077, + "grad_norm": 16.47636604309082, + "learning_rate": 6.961538461538462e-06, + "loss": 0.9552, + "step": 711 + }, + { + "epoch": 6.085470085470085, + "grad_norm": 17.04586410522461, + "learning_rate": 6.957264957264958e-06, + "loss": 0.9847, + "step": 712 + }, + { + "epoch": 6.094017094017094, + "grad_norm": 15.464738845825195, + "learning_rate": 6.952991452991453e-06, + "loss": 0.9272, + "step": 713 + }, + { + "epoch": 6.102564102564102, + "grad_norm": 11.837206840515137, + "learning_rate": 6.948717948717949e-06, + "loss": 1.1682, + "step": 714 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 11.013447761535645, + "learning_rate": 6.944444444444445e-06, + "loss": 1.222, + "step": 715 + }, + { + "epoch": 6.119658119658119, + "grad_norm": 15.37415885925293, + "learning_rate": 6.940170940170941e-06, + "loss": 0.9668, + "step": 716 + }, + { + "epoch": 6.128205128205128, + "grad_norm": 14.077155113220215, + "learning_rate": 6.9358974358974366e-06, + "loss": 0.8448, + "step": 717 + }, + { + "epoch": 6.136752136752137, + "grad_norm": 13.440519332885742, + "learning_rate": 6.931623931623932e-06, + "loss": 0.891, + "step": 718 + }, + { + "epoch": 6.145299145299146, + "grad_norm": 13.059304237365723, + "learning_rate": 6.9273504273504275e-06, + "loss": 0.655, + "step": 719 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 12.96674633026123, + "learning_rate": 6.923076923076923e-06, + "loss": 0.7755, + "step": 720 + }, + { + "epoch": 6.162393162393163, + "grad_norm": 10.921567916870117, + "learning_rate": 6.918803418803419e-06, + "loss": 0.8533, + "step": 721 + }, + { + "epoch": 6.170940170940171, + "grad_norm": 10.439260482788086, + "learning_rate": 6.914529914529915e-06, + "loss": 0.8294, + "step": 722 + }, + { + "epoch": 6.17948717948718, + "grad_norm": 14.948200225830078, + "learning_rate": 6.910256410256411e-06, + "loss": 0.7326, + "step": 723 + }, + { + "epoch": 6.188034188034188, + "grad_norm": 12.733176231384277, + "learning_rate": 6.905982905982906e-06, + "loss": 1.0244, + "step": 724 + }, + { + "epoch": 6.196581196581197, + "grad_norm": 12.432938575744629, + "learning_rate": 6.901709401709402e-06, + "loss": 0.7375, + "step": 725 + }, + { + "epoch": 6.205128205128205, + "grad_norm": 12.047768592834473, + "learning_rate": 6.897435897435898e-06, + "loss": 0.8348, + "step": 726 + }, + { + "epoch": 6.213675213675214, + "grad_norm": 19.029287338256836, + "learning_rate": 6.893162393162394e-06, + "loss": 0.6091, + "step": 727 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 11.650983810424805, + "learning_rate": 6.88888888888889e-06, + "loss": 0.9925, + "step": 728 + }, + { + "epoch": 6.230769230769231, + "grad_norm": 12.12030029296875, + "learning_rate": 6.8846153846153855e-06, + "loss": 1.0205, + "step": 729 + }, + { + "epoch": 6.239316239316239, + "grad_norm": 10.283143997192383, + "learning_rate": 6.880341880341881e-06, + "loss": 0.7726, + "step": 730 + }, + { + "epoch": 6.247863247863248, + "grad_norm": 12.965302467346191, + "learning_rate": 6.8760683760683765e-06, + "loss": 1.1761, + "step": 731 + }, + { + "epoch": 6.256410256410256, + "grad_norm": 9.0562105178833, + "learning_rate": 6.871794871794872e-06, + "loss": 0.9769, + "step": 732 + }, + { + "epoch": 6.264957264957265, + "grad_norm": 13.647340774536133, + "learning_rate": 6.867521367521368e-06, + "loss": 0.7613, + "step": 733 + }, + { + "epoch": 6.273504273504273, + "grad_norm": 11.598361015319824, + "learning_rate": 6.863247863247864e-06, + "loss": 0.6236, + "step": 734 + }, + { + "epoch": 6.282051282051282, + "grad_norm": 10.453935623168945, + "learning_rate": 6.858974358974359e-06, + "loss": 0.9752, + "step": 735 + }, + { + "epoch": 6.2905982905982905, + "grad_norm": 14.108942985534668, + "learning_rate": 6.854700854700855e-06, + "loss": 0.9212, + "step": 736 + }, + { + "epoch": 6.299145299145299, + "grad_norm": 21.230859756469727, + "learning_rate": 6.850427350427351e-06, + "loss": 0.9213, + "step": 737 + }, + { + "epoch": 6.3076923076923075, + "grad_norm": 11.801465034484863, + "learning_rate": 6.846153846153847e-06, + "loss": 0.8182, + "step": 738 + }, + { + "epoch": 6.316239316239316, + "grad_norm": 18.9310302734375, + "learning_rate": 6.841880341880343e-06, + "loss": 0.6214, + "step": 739 + }, + { + "epoch": 6.3247863247863245, + "grad_norm": 11.773117065429688, + "learning_rate": 6.837606837606839e-06, + "loss": 0.6221, + "step": 740 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 187.00250244140625, + "learning_rate": 6.833333333333334e-06, + "loss": 1.5211, + "step": 741 + }, + { + "epoch": 6.3418803418803416, + "grad_norm": 70.96250915527344, + "learning_rate": 6.82905982905983e-06, + "loss": 1.3472, + "step": 742 + }, + { + "epoch": 6.35042735042735, + "grad_norm": 11.787941932678223, + "learning_rate": 6.8247863247863255e-06, + "loss": 0.8831, + "step": 743 + }, + { + "epoch": 6.358974358974359, + "grad_norm": 11.33661937713623, + "learning_rate": 6.820512820512821e-06, + "loss": 1.0555, + "step": 744 + }, + { + "epoch": 6.367521367521368, + "grad_norm": 14.255888938903809, + "learning_rate": 6.816239316239317e-06, + "loss": 0.8246, + "step": 745 + }, + { + "epoch": 6.3760683760683765, + "grad_norm": 10.89616870880127, + "learning_rate": 6.811965811965813e-06, + "loss": 1.0179, + "step": 746 + }, + { + "epoch": 6.384615384615385, + "grad_norm": 9.160380363464355, + "learning_rate": 6.807692307692308e-06, + "loss": 0.9019, + "step": 747 + }, + { + "epoch": 6.3931623931623935, + "grad_norm": 12.984644889831543, + "learning_rate": 6.803418803418804e-06, + "loss": 0.649, + "step": 748 + }, + { + "epoch": 6.401709401709402, + "grad_norm": 14.073376655578613, + "learning_rate": 6.7991452991453e-06, + "loss": 0.608, + "step": 749 + }, + { + "epoch": 6.410256410256411, + "grad_norm": 10.354485511779785, + "learning_rate": 6.794871794871796e-06, + "loss": 0.8812, + "step": 750 + }, + { + "epoch": 6.418803418803419, + "grad_norm": 9.121294975280762, + "learning_rate": 6.790598290598292e-06, + "loss": 0.768, + "step": 751 + }, + { + "epoch": 6.427350427350428, + "grad_norm": 10.909361839294434, + "learning_rate": 6.786324786324787e-06, + "loss": 0.8697, + "step": 752 + }, + { + "epoch": 6.435897435897436, + "grad_norm": 26.324186325073242, + "learning_rate": 6.782051282051283e-06, + "loss": 1.2437, + "step": 753 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 11.972411155700684, + "learning_rate": 6.777777777777779e-06, + "loss": 0.6366, + "step": 754 + }, + { + "epoch": 6.452991452991453, + "grad_norm": 25.042150497436523, + "learning_rate": 6.7735042735042745e-06, + "loss": 1.0371, + "step": 755 + }, + { + "epoch": 6.461538461538462, + "grad_norm": 10.331900596618652, + "learning_rate": 6.76923076923077e-06, + "loss": 0.5618, + "step": 756 + }, + { + "epoch": 6.47008547008547, + "grad_norm": 11.925344467163086, + "learning_rate": 6.764957264957266e-06, + "loss": 0.629, + "step": 757 + }, + { + "epoch": 6.478632478632479, + "grad_norm": 10.309441566467285, + "learning_rate": 6.760683760683761e-06, + "loss": 0.7158, + "step": 758 + }, + { + "epoch": 6.487179487179487, + "grad_norm": 11.374105453491211, + "learning_rate": 6.756410256410257e-06, + "loss": 0.6909, + "step": 759 + }, + { + "epoch": 6.495726495726496, + "grad_norm": 11.613142967224121, + "learning_rate": 6.752136752136753e-06, + "loss": 0.6139, + "step": 760 + }, + { + "epoch": 6.504273504273504, + "grad_norm": 14.499147415161133, + "learning_rate": 6.747863247863249e-06, + "loss": 0.7242, + "step": 761 + }, + { + "epoch": 6.512820512820513, + "grad_norm": 13.683001518249512, + "learning_rate": 6.743589743589745e-06, + "loss": 0.9246, + "step": 762 + }, + { + "epoch": 6.521367521367521, + "grad_norm": 11.068865776062012, + "learning_rate": 6.739316239316241e-06, + "loss": 0.8866, + "step": 763 + }, + { + "epoch": 6.52991452991453, + "grad_norm": 13.0232572555542, + "learning_rate": 6.735042735042736e-06, + "loss": 0.86, + "step": 764 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 10.639331817626953, + "learning_rate": 6.730769230769232e-06, + "loss": 0.6928, + "step": 765 + }, + { + "epoch": 6.547008547008547, + "grad_norm": 11.792994499206543, + "learning_rate": 6.7264957264957276e-06, + "loss": 0.6571, + "step": 766 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 15.907414436340332, + "learning_rate": 6.7222222222222235e-06, + "loss": 1.1426, + "step": 767 + }, + { + "epoch": 6.564102564102564, + "grad_norm": 12.207514762878418, + "learning_rate": 6.717948717948718e-06, + "loss": 1.0932, + "step": 768 + }, + { + "epoch": 6.572649572649572, + "grad_norm": 20.145288467407227, + "learning_rate": 6.7136752136752135e-06, + "loss": 0.9706, + "step": 769 + }, + { + "epoch": 6.581196581196581, + "grad_norm": 9.820805549621582, + "learning_rate": 6.7094017094017094e-06, + "loss": 0.4955, + "step": 770 + }, + { + "epoch": 6.589743589743589, + "grad_norm": 10.385655403137207, + "learning_rate": 6.705128205128205e-06, + "loss": 1.0172, + "step": 771 + }, + { + "epoch": 6.598290598290598, + "grad_norm": 11.708373069763184, + "learning_rate": 6.700854700854701e-06, + "loss": 0.8048, + "step": 772 + }, + { + "epoch": 6.6068376068376065, + "grad_norm": 9.812984466552734, + "learning_rate": 6.696581196581196e-06, + "loss": 0.4831, + "step": 773 + }, + { + "epoch": 6.615384615384615, + "grad_norm": 9.146960258483887, + "learning_rate": 6.692307692307692e-06, + "loss": 0.6178, + "step": 774 + }, + { + "epoch": 6.6239316239316235, + "grad_norm": 13.61231517791748, + "learning_rate": 6.688034188034188e-06, + "loss": 0.7812, + "step": 775 + }, + { + "epoch": 6.632478632478632, + "grad_norm": 10.349262237548828, + "learning_rate": 6.683760683760684e-06, + "loss": 0.819, + "step": 776 + }, + { + "epoch": 6.641025641025641, + "grad_norm": 48.387847900390625, + "learning_rate": 6.67948717948718e-06, + "loss": 1.5294, + "step": 777 + }, + { + "epoch": 6.64957264957265, + "grad_norm": 9.540630340576172, + "learning_rate": 6.675213675213676e-06, + "loss": 0.6564, + "step": 778 + }, + { + "epoch": 6.6581196581196584, + "grad_norm": 10.83983039855957, + "learning_rate": 6.670940170940171e-06, + "loss": 0.5109, + "step": 779 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 15.380743026733398, + "learning_rate": 6.666666666666667e-06, + "loss": 0.6504, + "step": 780 + }, + { + "epoch": 6.6752136752136755, + "grad_norm": 16.796918869018555, + "learning_rate": 6.6623931623931625e-06, + "loss": 0.7944, + "step": 781 + }, + { + "epoch": 6.683760683760684, + "grad_norm": 39.64078140258789, + "learning_rate": 6.6581196581196584e-06, + "loss": 0.6929, + "step": 782 + }, + { + "epoch": 6.6923076923076925, + "grad_norm": 7.730568885803223, + "learning_rate": 6.653846153846154e-06, + "loss": 0.6284, + "step": 783 + }, + { + "epoch": 6.700854700854701, + "grad_norm": 7.840725898742676, + "learning_rate": 6.64957264957265e-06, + "loss": 0.5113, + "step": 784 + }, + { + "epoch": 6.7094017094017095, + "grad_norm": 13.925577163696289, + "learning_rate": 6.645299145299145e-06, + "loss": 0.6846, + "step": 785 + }, + { + "epoch": 6.717948717948718, + "grad_norm": 10.926531791687012, + "learning_rate": 6.641025641025641e-06, + "loss": 1.3245, + "step": 786 + }, + { + "epoch": 6.726495726495727, + "grad_norm": 10.698541641235352, + "learning_rate": 6.636752136752137e-06, + "loss": 0.6025, + "step": 787 + }, + { + "epoch": 6.735042735042735, + "grad_norm": 7.572136878967285, + "learning_rate": 6.632478632478633e-06, + "loss": 0.5473, + "step": 788 + }, + { + "epoch": 6.743589743589744, + "grad_norm": 26.242990493774414, + "learning_rate": 6.628205128205129e-06, + "loss": 0.5637, + "step": 789 + }, + { + "epoch": 6.752136752136752, + "grad_norm": 8.79776668548584, + "learning_rate": 6.623931623931624e-06, + "loss": 0.7595, + "step": 790 + }, + { + "epoch": 6.760683760683761, + "grad_norm": 8.951017379760742, + "learning_rate": 6.61965811965812e-06, + "loss": 1.0365, + "step": 791 + }, + { + "epoch": 6.769230769230769, + "grad_norm": 13.799118041992188, + "learning_rate": 6.615384615384616e-06, + "loss": 1.4206, + "step": 792 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 674.3671875, + "learning_rate": 6.6111111111111115e-06, + "loss": 1.1752, + "step": 793 + }, + { + "epoch": 6.786324786324786, + "grad_norm": 8.110879898071289, + "learning_rate": 6.606837606837607e-06, + "loss": 0.4668, + "step": 794 + }, + { + "epoch": 6.794871794871795, + "grad_norm": 8.119854927062988, + "learning_rate": 6.602564102564103e-06, + "loss": 0.7689, + "step": 795 + }, + { + "epoch": 6.803418803418803, + "grad_norm": 11.039762496948242, + "learning_rate": 6.598290598290598e-06, + "loss": 0.5636, + "step": 796 + }, + { + "epoch": 6.811965811965812, + "grad_norm": 12.724084854125977, + "learning_rate": 6.594017094017094e-06, + "loss": 0.5072, + "step": 797 + }, + { + "epoch": 6.82051282051282, + "grad_norm": 12.196049690246582, + "learning_rate": 6.58974358974359e-06, + "loss": 0.5073, + "step": 798 + }, + { + "epoch": 6.829059829059829, + "grad_norm": 9.072951316833496, + "learning_rate": 6.585470085470086e-06, + "loss": 0.4855, + "step": 799 + }, + { + "epoch": 6.837606837606837, + "grad_norm": 10.53836441040039, + "learning_rate": 6.581196581196582e-06, + "loss": 1.0017, + "step": 800 + }, + { + "epoch": 6.846153846153846, + "grad_norm": 7.728690147399902, + "learning_rate": 6.576923076923078e-06, + "loss": 0.5784, + "step": 801 + }, + { + "epoch": 6.854700854700854, + "grad_norm": 28.362455368041992, + "learning_rate": 6.572649572649573e-06, + "loss": 1.0295, + "step": 802 + }, + { + "epoch": 6.863247863247864, + "grad_norm": 7.291123390197754, + "learning_rate": 6.568376068376069e-06, + "loss": 0.7836, + "step": 803 + }, + { + "epoch": 6.871794871794872, + "grad_norm": 9.566614151000977, + "learning_rate": 6.564102564102565e-06, + "loss": 0.9979, + "step": 804 + }, + { + "epoch": 6.880341880341881, + "grad_norm": 13.544408798217773, + "learning_rate": 6.5598290598290605e-06, + "loss": 0.5354, + "step": 805 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 8.546881675720215, + "learning_rate": 6.555555555555556e-06, + "loss": 0.4689, + "step": 806 + }, + { + "epoch": 6.897435897435898, + "grad_norm": 8.94822883605957, + "learning_rate": 6.5512820512820515e-06, + "loss": 0.4432, + "step": 807 + }, + { + "epoch": 6.905982905982906, + "grad_norm": 6.5176544189453125, + "learning_rate": 6.547008547008547e-06, + "loss": 0.6747, + "step": 808 + }, + { + "epoch": 6.914529914529915, + "grad_norm": 9.48947811126709, + "learning_rate": 6.542735042735043e-06, + "loss": 0.4268, + "step": 809 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 11.432586669921875, + "learning_rate": 6.538461538461539e-06, + "loss": 0.5486, + "step": 810 + }, + { + "epoch": 6.931623931623932, + "grad_norm": 7.585604667663574, + "learning_rate": 6.534188034188035e-06, + "loss": 0.4412, + "step": 811 + }, + { + "epoch": 6.94017094017094, + "grad_norm": 7.860292911529541, + "learning_rate": 6.529914529914531e-06, + "loss": 0.6428, + "step": 812 + }, + { + "epoch": 6.948717948717949, + "grad_norm": 27.83890151977539, + "learning_rate": 6.525641025641026e-06, + "loss": 0.6735, + "step": 813 + }, + { + "epoch": 6.957264957264957, + "grad_norm": 10.266451835632324, + "learning_rate": 6.521367521367522e-06, + "loss": 0.6757, + "step": 814 + }, + { + "epoch": 6.965811965811966, + "grad_norm": 8.839099884033203, + "learning_rate": 6.517094017094018e-06, + "loss": 0.7897, + "step": 815 + }, + { + "epoch": 6.9743589743589745, + "grad_norm": 10.037760734558105, + "learning_rate": 6.512820512820514e-06, + "loss": 0.7133, + "step": 816 + }, + { + "epoch": 6.982905982905983, + "grad_norm": 14.50278377532959, + "learning_rate": 6.5085470085470095e-06, + "loss": 1.0051, + "step": 817 + }, + { + "epoch": 6.9914529914529915, + "grad_norm": 8.775527000427246, + "learning_rate": 6.504273504273505e-06, + "loss": 0.8769, + "step": 818 + }, + { + "epoch": 7.0, + "grad_norm": 8.891378402709961, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.9586, + "step": 819 + }, + { + "epoch": 7.0, + "eval_loss": 0.23673956096172333, + "eval_runtime": 9.3447, + "eval_samples_per_second": 49.868, + "eval_steps_per_second": 6.314, + "step": 819 + }, + { + "epoch": 7.0085470085470085, + "grad_norm": 8.925857543945312, + "learning_rate": 6.495726495726496e-06, + "loss": 1.0406, + "step": 820 + }, + { + "epoch": 7.017094017094017, + "grad_norm": 8.222796440124512, + "learning_rate": 6.491452991452992e-06, + "loss": 0.4911, + "step": 821 + }, + { + "epoch": 7.0256410256410255, + "grad_norm": 11.528886795043945, + "learning_rate": 6.487179487179488e-06, + "loss": 0.8292, + "step": 822 + }, + { + "epoch": 7.034188034188034, + "grad_norm": 7.9031524658203125, + "learning_rate": 6.482905982905984e-06, + "loss": 0.5319, + "step": 823 + }, + { + "epoch": 7.042735042735043, + "grad_norm": 6.788857936859131, + "learning_rate": 6.478632478632479e-06, + "loss": 0.431, + "step": 824 + }, + { + "epoch": 7.051282051282051, + "grad_norm": 8.84765911102295, + "learning_rate": 6.474358974358975e-06, + "loss": 0.6417, + "step": 825 + }, + { + "epoch": 7.05982905982906, + "grad_norm": 7.517561435699463, + "learning_rate": 6.470085470085471e-06, + "loss": 0.5828, + "step": 826 + }, + { + "epoch": 7.068376068376068, + "grad_norm": 9.86832332611084, + "learning_rate": 6.465811965811967e-06, + "loss": 0.5851, + "step": 827 + }, + { + "epoch": 7.076923076923077, + "grad_norm": 9.632494926452637, + "learning_rate": 6.461538461538463e-06, + "loss": 0.769, + "step": 828 + }, + { + "epoch": 7.085470085470085, + "grad_norm": 9.874857902526855, + "learning_rate": 6.4572649572649585e-06, + "loss": 0.4393, + "step": 829 + }, + { + "epoch": 7.094017094017094, + "grad_norm": 11.78085994720459, + "learning_rate": 6.4529914529914535e-06, + "loss": 0.8784, + "step": 830 + }, + { + "epoch": 7.102564102564102, + "grad_norm": 8.85053825378418, + "learning_rate": 6.4487179487179494e-06, + "loss": 0.5911, + "step": 831 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 12.405013084411621, + "learning_rate": 6.444444444444445e-06, + "loss": 0.4941, + "step": 832 + }, + { + "epoch": 7.119658119658119, + "grad_norm": 12.237760543823242, + "learning_rate": 6.440170940170941e-06, + "loss": 0.4468, + "step": 833 + }, + { + "epoch": 7.128205128205128, + "grad_norm": 7.945899486541748, + "learning_rate": 6.435897435897437e-06, + "loss": 0.4101, + "step": 834 + }, + { + "epoch": 7.136752136752137, + "grad_norm": 10.743217468261719, + "learning_rate": 6.431623931623933e-06, + "loss": 0.679, + "step": 835 + }, + { + "epoch": 7.145299145299146, + "grad_norm": 7.700406551361084, + "learning_rate": 6.427350427350428e-06, + "loss": 0.5067, + "step": 836 + }, + { + "epoch": 7.153846153846154, + "grad_norm": 8.401918411254883, + "learning_rate": 6.423076923076924e-06, + "loss": 0.5893, + "step": 837 + }, + { + "epoch": 7.162393162393163, + "grad_norm": 23.065881729125977, + "learning_rate": 6.41880341880342e-06, + "loss": 0.6768, + "step": 838 + }, + { + "epoch": 7.170940170940171, + "grad_norm": 38.71855545043945, + "learning_rate": 6.414529914529916e-06, + "loss": 0.8828, + "step": 839 + }, + { + "epoch": 7.17948717948718, + "grad_norm": 12.142110824584961, + "learning_rate": 6.410256410256412e-06, + "loss": 0.5444, + "step": 840 + }, + { + "epoch": 7.188034188034188, + "grad_norm": 69.4731674194336, + "learning_rate": 6.405982905982906e-06, + "loss": 0.7768, + "step": 841 + }, + { + "epoch": 7.196581196581197, + "grad_norm": 15.926841735839844, + "learning_rate": 6.401709401709402e-06, + "loss": 0.4348, + "step": 842 + }, + { + "epoch": 7.205128205128205, + "grad_norm": 6.8418965339660645, + "learning_rate": 6.397435897435898e-06, + "loss": 0.3821, + "step": 843 + }, + { + "epoch": 7.213675213675214, + "grad_norm": 6.716574192047119, + "learning_rate": 6.3931623931623935e-06, + "loss": 0.3621, + "step": 844 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 7.452919006347656, + "learning_rate": 6.3888888888888885e-06, + "loss": 0.4997, + "step": 845 + }, + { + "epoch": 7.230769230769231, + "grad_norm": 11.502019882202148, + "learning_rate": 6.384615384615384e-06, + "loss": 0.8017, + "step": 846 + }, + { + "epoch": 7.239316239316239, + "grad_norm": 7.349746227264404, + "learning_rate": 6.38034188034188e-06, + "loss": 0.2745, + "step": 847 + }, + { + "epoch": 7.247863247863248, + "grad_norm": 6.269787311553955, + "learning_rate": 6.376068376068376e-06, + "loss": 0.4131, + "step": 848 + }, + { + "epoch": 7.256410256410256, + "grad_norm": 9.56203842163086, + "learning_rate": 6.371794871794872e-06, + "loss": 0.8147, + "step": 849 + }, + { + "epoch": 7.264957264957265, + "grad_norm": 7.358108043670654, + "learning_rate": 6.367521367521368e-06, + "loss": 0.3552, + "step": 850 + }, + { + "epoch": 7.273504273504273, + "grad_norm": 7.6359782218933105, + "learning_rate": 6.363247863247863e-06, + "loss": 0.3302, + "step": 851 + }, + { + "epoch": 7.282051282051282, + "grad_norm": 7.356925010681152, + "learning_rate": 6.358974358974359e-06, + "loss": 0.2927, + "step": 852 + }, + { + "epoch": 7.2905982905982905, + "grad_norm": 11.097757339477539, + "learning_rate": 6.354700854700855e-06, + "loss": 0.8117, + "step": 853 + }, + { + "epoch": 7.299145299145299, + "grad_norm": 10.301170349121094, + "learning_rate": 6.350427350427351e-06, + "loss": 0.4044, + "step": 854 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 7.116042613983154, + "learning_rate": 6.3461538461538466e-06, + "loss": 0.289, + "step": 855 + }, + { + "epoch": 7.316239316239316, + "grad_norm": 7.453964710235596, + "learning_rate": 6.3418803418803425e-06, + "loss": 0.4652, + "step": 856 + }, + { + "epoch": 7.3247863247863245, + "grad_norm": 11.864774703979492, + "learning_rate": 6.3376068376068375e-06, + "loss": 0.4667, + "step": 857 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 8.79547119140625, + "learning_rate": 6.333333333333333e-06, + "loss": 0.2874, + "step": 858 + }, + { + "epoch": 7.3418803418803416, + "grad_norm": 10.173043251037598, + "learning_rate": 6.329059829059829e-06, + "loss": 0.6844, + "step": 859 + }, + { + "epoch": 7.35042735042735, + "grad_norm": 9.26555061340332, + "learning_rate": 6.324786324786325e-06, + "loss": 0.2903, + "step": 860 + }, + { + "epoch": 7.358974358974359, + "grad_norm": 10.274518013000488, + "learning_rate": 6.320512820512821e-06, + "loss": 0.7824, + "step": 861 + }, + { + "epoch": 7.367521367521368, + "grad_norm": 7.104451656341553, + "learning_rate": 6.316239316239316e-06, + "loss": 0.3024, + "step": 862 + }, + { + "epoch": 7.3760683760683765, + "grad_norm": 9.522738456726074, + "learning_rate": 6.311965811965812e-06, + "loss": 0.3219, + "step": 863 + }, + { + "epoch": 7.384615384615385, + "grad_norm": 10.145588874816895, + "learning_rate": 6.307692307692308e-06, + "loss": 0.5319, + "step": 864 + }, + { + "epoch": 7.3931623931623935, + "grad_norm": 8.828988075256348, + "learning_rate": 6.303418803418804e-06, + "loss": 0.3286, + "step": 865 + }, + { + "epoch": 7.401709401709402, + "grad_norm": 7.314462661743164, + "learning_rate": 6.2991452991453e-06, + "loss": 0.2951, + "step": 866 + }, + { + "epoch": 7.410256410256411, + "grad_norm": 13.465666770935059, + "learning_rate": 6.2948717948717956e-06, + "loss": 0.4046, + "step": 867 + }, + { + "epoch": 7.418803418803419, + "grad_norm": 12.40607738494873, + "learning_rate": 6.290598290598291e-06, + "loss": 0.71, + "step": 868 + }, + { + "epoch": 7.427350427350428, + "grad_norm": 9.282904624938965, + "learning_rate": 6.2863247863247865e-06, + "loss": 0.4083, + "step": 869 + }, + { + "epoch": 7.435897435897436, + "grad_norm": 5.755247116088867, + "learning_rate": 6.282051282051282e-06, + "loss": 0.3858, + "step": 870 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 6.996497631072998, + "learning_rate": 6.277777777777778e-06, + "loss": 0.2692, + "step": 871 + }, + { + "epoch": 7.452991452991453, + "grad_norm": 7.235395431518555, + "learning_rate": 6.273504273504274e-06, + "loss": 0.3936, + "step": 872 + }, + { + "epoch": 7.461538461538462, + "grad_norm": 14.275704383850098, + "learning_rate": 6.26923076923077e-06, + "loss": 0.4022, + "step": 873 + }, + { + "epoch": 7.47008547008547, + "grad_norm": 10.365689277648926, + "learning_rate": 6.264957264957265e-06, + "loss": 1.0508, + "step": 874 + }, + { + "epoch": 7.478632478632479, + "grad_norm": 5.840590000152588, + "learning_rate": 6.260683760683761e-06, + "loss": 0.2511, + "step": 875 + }, + { + "epoch": 7.487179487179487, + "grad_norm": 10.25346851348877, + "learning_rate": 6.256410256410257e-06, + "loss": 0.5836, + "step": 876 + }, + { + "epoch": 7.495726495726496, + "grad_norm": 27.662694931030273, + "learning_rate": 6.252136752136753e-06, + "loss": 0.7677, + "step": 877 + }, + { + "epoch": 7.504273504273504, + "grad_norm": 5.840217590332031, + "learning_rate": 6.247863247863249e-06, + "loss": 0.3889, + "step": 878 + }, + { + "epoch": 7.512820512820513, + "grad_norm": 9.813179016113281, + "learning_rate": 6.243589743589744e-06, + "loss": 0.8929, + "step": 879 + }, + { + "epoch": 7.521367521367521, + "grad_norm": 5.49755334854126, + "learning_rate": 6.23931623931624e-06, + "loss": 0.2712, + "step": 880 + }, + { + "epoch": 7.52991452991453, + "grad_norm": 7.17311429977417, + "learning_rate": 6.2350427350427355e-06, + "loss": 0.3071, + "step": 881 + }, + { + "epoch": 7.538461538461538, + "grad_norm": 7.706870079040527, + "learning_rate": 6.230769230769231e-06, + "loss": 0.3797, + "step": 882 + }, + { + "epoch": 7.547008547008547, + "grad_norm": 7.891415596008301, + "learning_rate": 6.226495726495727e-06, + "loss": 0.5352, + "step": 883 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 8.746044158935547, + "learning_rate": 6.222222222222223e-06, + "loss": 0.263, + "step": 884 + }, + { + "epoch": 7.564102564102564, + "grad_norm": 9.096441268920898, + "learning_rate": 6.217948717948718e-06, + "loss": 0.2736, + "step": 885 + }, + { + "epoch": 7.572649572649572, + "grad_norm": 7.031003475189209, + "learning_rate": 6.213675213675214e-06, + "loss": 0.4705, + "step": 886 + }, + { + "epoch": 7.581196581196581, + "grad_norm": 6.6503143310546875, + "learning_rate": 6.20940170940171e-06, + "loss": 0.3285, + "step": 887 + }, + { + "epoch": 7.589743589743589, + "grad_norm": 5.398913383483887, + "learning_rate": 6.205128205128206e-06, + "loss": 0.41, + "step": 888 + }, + { + "epoch": 7.598290598290598, + "grad_norm": 7.47569465637207, + "learning_rate": 6.200854700854702e-06, + "loss": 0.4005, + "step": 889 + }, + { + "epoch": 7.6068376068376065, + "grad_norm": 8.79906940460205, + "learning_rate": 6.196581196581198e-06, + "loss": 0.2608, + "step": 890 + }, + { + "epoch": 7.615384615384615, + "grad_norm": 7.604002475738525, + "learning_rate": 6.192307692307693e-06, + "loss": 0.577, + "step": 891 + }, + { + "epoch": 7.6239316239316235, + "grad_norm": 12.666848182678223, + "learning_rate": 6.188034188034189e-06, + "loss": 0.7296, + "step": 892 + }, + { + "epoch": 7.632478632478632, + "grad_norm": 20.92390251159668, + "learning_rate": 6.1837606837606845e-06, + "loss": 0.9276, + "step": 893 + }, + { + "epoch": 7.641025641025641, + "grad_norm": 6.779317855834961, + "learning_rate": 6.17948717948718e-06, + "loss": 0.818, + "step": 894 + }, + { + "epoch": 7.64957264957265, + "grad_norm": 5.249539852142334, + "learning_rate": 6.175213675213676e-06, + "loss": 0.2117, + "step": 895 + }, + { + "epoch": 7.6581196581196584, + "grad_norm": 23.55508041381836, + "learning_rate": 6.170940170940171e-06, + "loss": 0.5239, + "step": 896 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 11.711256980895996, + "learning_rate": 6.166666666666667e-06, + "loss": 0.6595, + "step": 897 + }, + { + "epoch": 7.6752136752136755, + "grad_norm": 6.641115188598633, + "learning_rate": 6.162393162393163e-06, + "loss": 0.4888, + "step": 898 + }, + { + "epoch": 7.683760683760684, + "grad_norm": 7.913390159606934, + "learning_rate": 6.158119658119659e-06, + "loss": 0.66, + "step": 899 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 17.927574157714844, + "learning_rate": 6.153846153846155e-06, + "loss": 0.9603, + "step": 900 + }, + { + "epoch": 7.700854700854701, + "grad_norm": 4.567203998565674, + "learning_rate": 6.149572649572651e-06, + "loss": 0.1638, + "step": 901 + }, + { + "epoch": 7.7094017094017095, + "grad_norm": 5.995935440063477, + "learning_rate": 6.145299145299146e-06, + "loss": 0.6852, + "step": 902 + }, + { + "epoch": 7.717948717948718, + "grad_norm": 8.323802947998047, + "learning_rate": 6.141025641025642e-06, + "loss": 0.5293, + "step": 903 + }, + { + "epoch": 7.726495726495727, + "grad_norm": 6.8586859703063965, + "learning_rate": 6.136752136752138e-06, + "loss": 0.3265, + "step": 904 + }, + { + "epoch": 7.735042735042735, + "grad_norm": 6.507427215576172, + "learning_rate": 6.1324786324786335e-06, + "loss": 0.2841, + "step": 905 + }, + { + "epoch": 7.743589743589744, + "grad_norm": 6.789999485015869, + "learning_rate": 6.128205128205129e-06, + "loss": 0.4236, + "step": 906 + }, + { + "epoch": 7.752136752136752, + "grad_norm": 19.444454193115234, + "learning_rate": 6.123931623931625e-06, + "loss": 0.2829, + "step": 907 + }, + { + "epoch": 7.760683760683761, + "grad_norm": 31.564800262451172, + "learning_rate": 6.11965811965812e-06, + "loss": 1.093, + "step": 908 + }, + { + "epoch": 7.769230769230769, + "grad_norm": 9.956007957458496, + "learning_rate": 6.115384615384616e-06, + "loss": 0.6749, + "step": 909 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 5.193087577819824, + "learning_rate": 6.111111111111112e-06, + "loss": 0.1986, + "step": 910 + }, + { + "epoch": 7.786324786324786, + "grad_norm": 4.792945384979248, + "learning_rate": 6.106837606837608e-06, + "loss": 0.5179, + "step": 911 + }, + { + "epoch": 7.794871794871795, + "grad_norm": 20.602317810058594, + "learning_rate": 6.102564102564104e-06, + "loss": 1.0343, + "step": 912 + }, + { + "epoch": 7.803418803418803, + "grad_norm": 22.205543518066406, + "learning_rate": 6.098290598290599e-06, + "loss": 0.4921, + "step": 913 + }, + { + "epoch": 7.811965811965812, + "grad_norm": 13.392712593078613, + "learning_rate": 6.094017094017095e-06, + "loss": 0.9058, + "step": 914 + }, + { + "epoch": 7.82051282051282, + "grad_norm": 6.262679100036621, + "learning_rate": 6.08974358974359e-06, + "loss": 0.3877, + "step": 915 + }, + { + "epoch": 7.829059829059829, + "grad_norm": 12.727428436279297, + "learning_rate": 6.085470085470086e-06, + "loss": 0.4477, + "step": 916 + }, + { + "epoch": 7.837606837606837, + "grad_norm": 6.595224380493164, + "learning_rate": 6.081196581196581e-06, + "loss": 0.5553, + "step": 917 + }, + { + "epoch": 7.846153846153846, + "grad_norm": 6.815043926239014, + "learning_rate": 6.076923076923077e-06, + "loss": 0.2978, + "step": 918 + }, + { + "epoch": 7.854700854700854, + "grad_norm": 11.751949310302734, + "learning_rate": 6.0726495726495726e-06, + "loss": 0.5509, + "step": 919 + }, + { + "epoch": 7.863247863247864, + "grad_norm": 6.067570209503174, + "learning_rate": 6.0683760683760684e-06, + "loss": 0.475, + "step": 920 + }, + { + "epoch": 7.871794871794872, + "grad_norm": 7.4297919273376465, + "learning_rate": 6.064102564102564e-06, + "loss": 0.5073, + "step": 921 + }, + { + "epoch": 7.880341880341881, + "grad_norm": 6.778268337249756, + "learning_rate": 6.05982905982906e-06, + "loss": 0.4718, + "step": 922 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 9.401915550231934, + "learning_rate": 6.055555555555555e-06, + "loss": 0.7151, + "step": 923 + }, + { + "epoch": 7.897435897435898, + "grad_norm": 6.359888553619385, + "learning_rate": 6.051282051282051e-06, + "loss": 0.3175, + "step": 924 + }, + { + "epoch": 7.905982905982906, + "grad_norm": 7.036016464233398, + "learning_rate": 6.047008547008547e-06, + "loss": 0.3172, + "step": 925 + }, + { + "epoch": 7.914529914529915, + "grad_norm": 5.980124473571777, + "learning_rate": 6.042735042735043e-06, + "loss": 0.2949, + "step": 926 + }, + { + "epoch": 7.923076923076923, + "grad_norm": 5.738795280456543, + "learning_rate": 6.038461538461539e-06, + "loss": 0.2454, + "step": 927 + }, + { + "epoch": 7.931623931623932, + "grad_norm": 4.688748359680176, + "learning_rate": 6.034188034188035e-06, + "loss": 0.1949, + "step": 928 + }, + { + "epoch": 7.94017094017094, + "grad_norm": 7.2333984375, + "learning_rate": 6.02991452991453e-06, + "loss": 0.2174, + "step": 929 + }, + { + "epoch": 7.948717948717949, + "grad_norm": 6.005523204803467, + "learning_rate": 6.025641025641026e-06, + "loss": 0.4216, + "step": 930 + }, + { + "epoch": 7.957264957264957, + "grad_norm": 6.017541885375977, + "learning_rate": 6.0213675213675215e-06, + "loss": 0.4904, + "step": 931 + }, + { + "epoch": 7.965811965811966, + "grad_norm": 19.559003829956055, + "learning_rate": 6.0170940170940174e-06, + "loss": 0.2616, + "step": 932 + }, + { + "epoch": 7.9743589743589745, + "grad_norm": 5.360724449157715, + "learning_rate": 6.012820512820513e-06, + "loss": 0.3629, + "step": 933 + }, + { + "epoch": 7.982905982905983, + "grad_norm": 9.472721099853516, + "learning_rate": 6.008547008547008e-06, + "loss": 0.5044, + "step": 934 + }, + { + "epoch": 7.9914529914529915, + "grad_norm": 6.453597068786621, + "learning_rate": 6.004273504273504e-06, + "loss": 0.4742, + "step": 935 + }, + { + "epoch": 8.0, + "grad_norm": 7.647386074066162, + "learning_rate": 6e-06, + "loss": 0.402, + "step": 936 + }, + { + "epoch": 8.0, + "eval_loss": 0.1672903448343277, + "eval_runtime": 9.3047, + "eval_samples_per_second": 50.082, + "eval_steps_per_second": 6.341, + "step": 936 + }, + { + "epoch": 8.008547008547009, + "grad_norm": 5.8361663818359375, + "learning_rate": 5.995726495726496e-06, + "loss": 0.164, + "step": 937 + }, + { + "epoch": 8.017094017094017, + "grad_norm": 5.801360130310059, + "learning_rate": 5.991452991452992e-06, + "loss": 0.2858, + "step": 938 + }, + { + "epoch": 8.025641025641026, + "grad_norm": 4.43051290512085, + "learning_rate": 5.987179487179488e-06, + "loss": 0.2068, + "step": 939 + }, + { + "epoch": 8.034188034188034, + "grad_norm": 6.544061660766602, + "learning_rate": 5.982905982905983e-06, + "loss": 0.3499, + "step": 940 + }, + { + "epoch": 8.042735042735043, + "grad_norm": 5.500844955444336, + "learning_rate": 5.978632478632479e-06, + "loss": 0.3134, + "step": 941 + }, + { + "epoch": 8.051282051282051, + "grad_norm": 4.286651611328125, + "learning_rate": 5.974358974358975e-06, + "loss": 0.1767, + "step": 942 + }, + { + "epoch": 8.05982905982906, + "grad_norm": 13.860437393188477, + "learning_rate": 5.9700854700854705e-06, + "loss": 0.3913, + "step": 943 + }, + { + "epoch": 8.068376068376068, + "grad_norm": 5.998767852783203, + "learning_rate": 5.9658119658119664e-06, + "loss": 0.2275, + "step": 944 + }, + { + "epoch": 8.076923076923077, + "grad_norm": 9.01196002960205, + "learning_rate": 5.961538461538462e-06, + "loss": 0.5202, + "step": 945 + }, + { + "epoch": 8.085470085470085, + "grad_norm": 6.81577730178833, + "learning_rate": 5.957264957264957e-06, + "loss": 0.5923, + "step": 946 + }, + { + "epoch": 8.094017094017094, + "grad_norm": 7.400684833526611, + "learning_rate": 5.952991452991453e-06, + "loss": 0.2883, + "step": 947 + }, + { + "epoch": 8.102564102564102, + "grad_norm": 16.18587875366211, + "learning_rate": 5.948717948717949e-06, + "loss": 0.3377, + "step": 948 + }, + { + "epoch": 8.11111111111111, + "grad_norm": 5.017345428466797, + "learning_rate": 5.944444444444445e-06, + "loss": 0.3912, + "step": 949 + }, + { + "epoch": 8.11965811965812, + "grad_norm": 5.300196647644043, + "learning_rate": 5.940170940170941e-06, + "loss": 0.4056, + "step": 950 + }, + { + "epoch": 8.128205128205128, + "grad_norm": 6.3473405838012695, + "learning_rate": 5.935897435897436e-06, + "loss": 0.2559, + "step": 951 + }, + { + "epoch": 8.136752136752136, + "grad_norm": 12.37689208984375, + "learning_rate": 5.931623931623932e-06, + "loss": 0.2216, + "step": 952 + }, + { + "epoch": 8.145299145299145, + "grad_norm": 5.573046684265137, + "learning_rate": 5.927350427350428e-06, + "loss": 0.2047, + "step": 953 + }, + { + "epoch": 8.153846153846153, + "grad_norm": 5.033559322357178, + "learning_rate": 5.923076923076924e-06, + "loss": 0.3661, + "step": 954 + }, + { + "epoch": 8.162393162393162, + "grad_norm": 5.341614246368408, + "learning_rate": 5.9188034188034195e-06, + "loss": 0.2597, + "step": 955 + }, + { + "epoch": 8.17094017094017, + "grad_norm": 8.67937183380127, + "learning_rate": 5.914529914529915e-06, + "loss": 0.4098, + "step": 956 + }, + { + "epoch": 8.179487179487179, + "grad_norm": 3.957489252090454, + "learning_rate": 5.9102564102564105e-06, + "loss": 0.18, + "step": 957 + }, + { + "epoch": 8.188034188034187, + "grad_norm": 6.377108573913574, + "learning_rate": 5.905982905982906e-06, + "loss": 0.3414, + "step": 958 + }, + { + "epoch": 8.196581196581196, + "grad_norm": 8.621227264404297, + "learning_rate": 5.901709401709402e-06, + "loss": 1.1625, + "step": 959 + }, + { + "epoch": 8.205128205128204, + "grad_norm": 5.775392532348633, + "learning_rate": 5.897435897435898e-06, + "loss": 0.4283, + "step": 960 + }, + { + "epoch": 8.213675213675213, + "grad_norm": 4.522337913513184, + "learning_rate": 5.893162393162394e-06, + "loss": 0.3432, + "step": 961 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 5.594667434692383, + "learning_rate": 5.88888888888889e-06, + "loss": 0.5212, + "step": 962 + }, + { + "epoch": 8.23076923076923, + "grad_norm": 5.478531837463379, + "learning_rate": 5.884615384615385e-06, + "loss": 0.2273, + "step": 963 + }, + { + "epoch": 8.239316239316238, + "grad_norm": 6.08770751953125, + "learning_rate": 5.880341880341881e-06, + "loss": 0.2673, + "step": 964 + }, + { + "epoch": 8.247863247863247, + "grad_norm": 7.962898254394531, + "learning_rate": 5.876068376068377e-06, + "loss": 0.2654, + "step": 965 + }, + { + "epoch": 8.256410256410255, + "grad_norm": 6.443154335021973, + "learning_rate": 5.871794871794873e-06, + "loss": 0.2982, + "step": 966 + }, + { + "epoch": 8.264957264957266, + "grad_norm": 4.689123153686523, + "learning_rate": 5.8675213675213685e-06, + "loss": 0.3459, + "step": 967 + }, + { + "epoch": 8.273504273504274, + "grad_norm": 5.446859359741211, + "learning_rate": 5.863247863247864e-06, + "loss": 0.2792, + "step": 968 + }, + { + "epoch": 8.282051282051283, + "grad_norm": 5.562478542327881, + "learning_rate": 5.8589743589743595e-06, + "loss": 0.1939, + "step": 969 + }, + { + "epoch": 8.290598290598291, + "grad_norm": 4.726650714874268, + "learning_rate": 5.854700854700855e-06, + "loss": 0.1368, + "step": 970 + }, + { + "epoch": 8.2991452991453, + "grad_norm": 17.44293785095215, + "learning_rate": 5.850427350427351e-06, + "loss": 0.3836, + "step": 971 + }, + { + "epoch": 8.307692307692308, + "grad_norm": 5.568243980407715, + "learning_rate": 5.846153846153847e-06, + "loss": 0.3674, + "step": 972 + }, + { + "epoch": 8.316239316239317, + "grad_norm": 3.488147258758545, + "learning_rate": 5.841880341880343e-06, + "loss": 0.197, + "step": 973 + }, + { + "epoch": 8.324786324786325, + "grad_norm": 15.902129173278809, + "learning_rate": 5.837606837606838e-06, + "loss": 0.4199, + "step": 974 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 8.055335998535156, + "learning_rate": 5.833333333333334e-06, + "loss": 0.277, + "step": 975 + }, + { + "epoch": 8.341880341880342, + "grad_norm": 8.122756004333496, + "learning_rate": 5.82905982905983e-06, + "loss": 0.5572, + "step": 976 + }, + { + "epoch": 8.350427350427351, + "grad_norm": 5.7439961433410645, + "learning_rate": 5.824786324786326e-06, + "loss": 0.2031, + "step": 977 + }, + { + "epoch": 8.35897435897436, + "grad_norm": 4.329511642456055, + "learning_rate": 5.820512820512822e-06, + "loss": 0.4405, + "step": 978 + }, + { + "epoch": 8.367521367521368, + "grad_norm": 10.946788787841797, + "learning_rate": 5.8162393162393175e-06, + "loss": 0.4619, + "step": 979 + }, + { + "epoch": 8.376068376068377, + "grad_norm": 6.0579352378845215, + "learning_rate": 5.8119658119658126e-06, + "loss": 0.4679, + "step": 980 + }, + { + "epoch": 8.384615384615385, + "grad_norm": 5.656944751739502, + "learning_rate": 5.8076923076923084e-06, + "loss": 0.2395, + "step": 981 + }, + { + "epoch": 8.393162393162394, + "grad_norm": 5.344303607940674, + "learning_rate": 5.803418803418804e-06, + "loss": 0.2516, + "step": 982 + }, + { + "epoch": 8.401709401709402, + "grad_norm": 7.070309638977051, + "learning_rate": 5.7991452991453e-06, + "loss": 0.3169, + "step": 983 + }, + { + "epoch": 8.41025641025641, + "grad_norm": 5.168705940246582, + "learning_rate": 5.794871794871796e-06, + "loss": 0.3007, + "step": 984 + }, + { + "epoch": 8.418803418803419, + "grad_norm": 3.556293249130249, + "learning_rate": 5.790598290598292e-06, + "loss": 0.2089, + "step": 985 + }, + { + "epoch": 8.427350427350428, + "grad_norm": 4.943065166473389, + "learning_rate": 5.786324786324787e-06, + "loss": 0.2093, + "step": 986 + }, + { + "epoch": 8.435897435897436, + "grad_norm": 6.991105556488037, + "learning_rate": 5.782051282051283e-06, + "loss": 0.4671, + "step": 987 + }, + { + "epoch": 8.444444444444445, + "grad_norm": 5.276190280914307, + "learning_rate": 5.777777777777778e-06, + "loss": 0.2092, + "step": 988 + }, + { + "epoch": 8.452991452991453, + "grad_norm": 77.91864776611328, + "learning_rate": 5.773504273504273e-06, + "loss": 1.7536, + "step": 989 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 4.864828109741211, + "learning_rate": 5.769230769230769e-06, + "loss": 0.1669, + "step": 990 + }, + { + "epoch": 8.47008547008547, + "grad_norm": 4.416967391967773, + "learning_rate": 5.764957264957265e-06, + "loss": 0.2705, + "step": 991 + }, + { + "epoch": 8.478632478632479, + "grad_norm": 4.558652400970459, + "learning_rate": 5.760683760683761e-06, + "loss": 0.4332, + "step": 992 + }, + { + "epoch": 8.487179487179487, + "grad_norm": 8.17482852935791, + "learning_rate": 5.756410256410257e-06, + "loss": 0.7286, + "step": 993 + }, + { + "epoch": 8.495726495726496, + "grad_norm": 7.322425365447998, + "learning_rate": 5.7521367521367525e-06, + "loss": 0.8554, + "step": 994 + }, + { + "epoch": 8.504273504273504, + "grad_norm": 4.249075889587402, + "learning_rate": 5.7478632478632475e-06, + "loss": 0.2442, + "step": 995 + }, + { + "epoch": 8.512820512820513, + "grad_norm": 4.157267093658447, + "learning_rate": 5.743589743589743e-06, + "loss": 0.4207, + "step": 996 + }, + { + "epoch": 8.521367521367521, + "grad_norm": 4.118504047393799, + "learning_rate": 5.739316239316239e-06, + "loss": 0.1411, + "step": 997 + }, + { + "epoch": 8.52991452991453, + "grad_norm": 7.273322105407715, + "learning_rate": 5.735042735042735e-06, + "loss": 0.6269, + "step": 998 + }, + { + "epoch": 8.538461538461538, + "grad_norm": 4.7668633460998535, + "learning_rate": 5.730769230769231e-06, + "loss": 0.1894, + "step": 999 + }, + { + "epoch": 8.547008547008547, + "grad_norm": 5.869007110595703, + "learning_rate": 5.726495726495727e-06, + "loss": 0.7301, + "step": 1000 + }, + { + "epoch": 8.555555555555555, + "grad_norm": 5.987617015838623, + "learning_rate": 5.722222222222222e-06, + "loss": 0.29, + "step": 1001 + }, + { + "epoch": 8.564102564102564, + "grad_norm": 5.445812702178955, + "learning_rate": 5.717948717948718e-06, + "loss": 0.4278, + "step": 1002 + }, + { + "epoch": 8.572649572649572, + "grad_norm": 4.7509002685546875, + "learning_rate": 5.713675213675214e-06, + "loss": 0.3396, + "step": 1003 + }, + { + "epoch": 8.581196581196581, + "grad_norm": 5.584397315979004, + "learning_rate": 5.70940170940171e-06, + "loss": 0.1329, + "step": 1004 + }, + { + "epoch": 8.58974358974359, + "grad_norm": 4.627229690551758, + "learning_rate": 5.705128205128206e-06, + "loss": 0.3012, + "step": 1005 + }, + { + "epoch": 8.598290598290598, + "grad_norm": 7.724045276641846, + "learning_rate": 5.7008547008547015e-06, + "loss": 0.4876, + "step": 1006 + }, + { + "epoch": 8.606837606837606, + "grad_norm": 3.488499164581299, + "learning_rate": 5.6965811965811965e-06, + "loss": 0.2025, + "step": 1007 + }, + { + "epoch": 8.615384615384615, + "grad_norm": 14.487537384033203, + "learning_rate": 5.692307692307692e-06, + "loss": 0.6795, + "step": 1008 + }, + { + "epoch": 8.623931623931623, + "grad_norm": 4.03059196472168, + "learning_rate": 5.688034188034188e-06, + "loss": 0.2121, + "step": 1009 + }, + { + "epoch": 8.632478632478632, + "grad_norm": 3.278873920440674, + "learning_rate": 5.683760683760684e-06, + "loss": 0.3475, + "step": 1010 + }, + { + "epoch": 8.64102564102564, + "grad_norm": 4.599937915802002, + "learning_rate": 5.67948717948718e-06, + "loss": 0.2355, + "step": 1011 + }, + { + "epoch": 8.649572649572649, + "grad_norm": 6.314788818359375, + "learning_rate": 5.675213675213675e-06, + "loss": 0.2402, + "step": 1012 + }, + { + "epoch": 8.658119658119658, + "grad_norm": 3.4483532905578613, + "learning_rate": 5.670940170940171e-06, + "loss": 0.2189, + "step": 1013 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 299.8923645019531, + "learning_rate": 5.666666666666667e-06, + "loss": 1.0473, + "step": 1014 + }, + { + "epoch": 8.675213675213675, + "grad_norm": 13.14855670928955, + "learning_rate": 5.662393162393163e-06, + "loss": 0.3723, + "step": 1015 + }, + { + "epoch": 8.683760683760683, + "grad_norm": 6.513180732727051, + "learning_rate": 5.658119658119659e-06, + "loss": 0.483, + "step": 1016 + }, + { + "epoch": 8.692307692307692, + "grad_norm": 5.026037693023682, + "learning_rate": 5.6538461538461546e-06, + "loss": 0.4417, + "step": 1017 + }, + { + "epoch": 8.7008547008547, + "grad_norm": 176.535888671875, + "learning_rate": 5.64957264957265e-06, + "loss": 0.5256, + "step": 1018 + }, + { + "epoch": 8.709401709401709, + "grad_norm": 6.023639678955078, + "learning_rate": 5.6452991452991455e-06, + "loss": 0.3708, + "step": 1019 + }, + { + "epoch": 8.717948717948717, + "grad_norm": 16.64018440246582, + "learning_rate": 5.641025641025641e-06, + "loss": 0.8908, + "step": 1020 + }, + { + "epoch": 8.726495726495726, + "grad_norm": 2.9167582988739014, + "learning_rate": 5.636752136752137e-06, + "loss": 0.077, + "step": 1021 + }, + { + "epoch": 8.735042735042736, + "grad_norm": 3.368325710296631, + "learning_rate": 5.632478632478633e-06, + "loss": 0.2495, + "step": 1022 + }, + { + "epoch": 8.743589743589745, + "grad_norm": 3.7961905002593994, + "learning_rate": 5.628205128205129e-06, + "loss": 0.4427, + "step": 1023 + }, + { + "epoch": 8.752136752136753, + "grad_norm": 4.661024570465088, + "learning_rate": 5.623931623931624e-06, + "loss": 0.3092, + "step": 1024 + }, + { + "epoch": 8.760683760683762, + "grad_norm": 5.1971588134765625, + "learning_rate": 5.61965811965812e-06, + "loss": 0.2213, + "step": 1025 + }, + { + "epoch": 8.76923076923077, + "grad_norm": 4.427041530609131, + "learning_rate": 5.615384615384616e-06, + "loss": 0.2885, + "step": 1026 + }, + { + "epoch": 8.777777777777779, + "grad_norm": 7.352906703948975, + "learning_rate": 5.611111111111112e-06, + "loss": 0.2689, + "step": 1027 + }, + { + "epoch": 8.786324786324787, + "grad_norm": 5.306934833526611, + "learning_rate": 5.606837606837608e-06, + "loss": 0.3758, + "step": 1028 + }, + { + "epoch": 8.794871794871796, + "grad_norm": 4.502418041229248, + "learning_rate": 5.602564102564103e-06, + "loss": 0.4655, + "step": 1029 + }, + { + "epoch": 8.803418803418804, + "grad_norm": 3.427734851837158, + "learning_rate": 5.598290598290599e-06, + "loss": 0.1145, + "step": 1030 + }, + { + "epoch": 8.811965811965813, + "grad_norm": 4.047433376312256, + "learning_rate": 5.5940170940170945e-06, + "loss": 0.1482, + "step": 1031 + }, + { + "epoch": 8.820512820512821, + "grad_norm": 3.6860435009002686, + "learning_rate": 5.58974358974359e-06, + "loss": 0.1152, + "step": 1032 + }, + { + "epoch": 8.82905982905983, + "grad_norm": 6.792733669281006, + "learning_rate": 5.585470085470086e-06, + "loss": 0.1732, + "step": 1033 + }, + { + "epoch": 8.837606837606838, + "grad_norm": 4.222206115722656, + "learning_rate": 5.581196581196582e-06, + "loss": 0.1259, + "step": 1034 + }, + { + "epoch": 8.846153846153847, + "grad_norm": 4.376220703125, + "learning_rate": 5.576923076923077e-06, + "loss": 0.2403, + "step": 1035 + }, + { + "epoch": 8.854700854700855, + "grad_norm": 3.459076166152954, + "learning_rate": 5.572649572649573e-06, + "loss": 0.2064, + "step": 1036 + }, + { + "epoch": 8.863247863247864, + "grad_norm": 6.312697410583496, + "learning_rate": 5.568376068376069e-06, + "loss": 0.5076, + "step": 1037 + }, + { + "epoch": 8.871794871794872, + "grad_norm": 10.137848854064941, + "learning_rate": 5.564102564102565e-06, + "loss": 0.1649, + "step": 1038 + }, + { + "epoch": 8.88034188034188, + "grad_norm": 6.605007171630859, + "learning_rate": 5.559829059829061e-06, + "loss": 0.4233, + "step": 1039 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 3.9786465167999268, + "learning_rate": 5.555555555555557e-06, + "loss": 0.1801, + "step": 1040 + }, + { + "epoch": 8.897435897435898, + "grad_norm": 4.40491247177124, + "learning_rate": 5.551282051282052e-06, + "loss": 0.169, + "step": 1041 + }, + { + "epoch": 8.905982905982906, + "grad_norm": 4.719818592071533, + "learning_rate": 5.547008547008548e-06, + "loss": 0.1454, + "step": 1042 + }, + { + "epoch": 8.914529914529915, + "grad_norm": 2.384941577911377, + "learning_rate": 5.5427350427350435e-06, + "loss": 0.0723, + "step": 1043 + }, + { + "epoch": 8.923076923076923, + "grad_norm": 3.258315324783325, + "learning_rate": 5.538461538461539e-06, + "loss": 0.1023, + "step": 1044 + }, + { + "epoch": 8.931623931623932, + "grad_norm": 18.745052337646484, + "learning_rate": 5.534188034188035e-06, + "loss": 0.2673, + "step": 1045 + }, + { + "epoch": 8.94017094017094, + "grad_norm": 3.788177967071533, + "learning_rate": 5.52991452991453e-06, + "loss": 0.3173, + "step": 1046 + }, + { + "epoch": 8.948717948717949, + "grad_norm": 2.734895944595337, + "learning_rate": 5.525641025641026e-06, + "loss": 0.0834, + "step": 1047 + }, + { + "epoch": 8.957264957264957, + "grad_norm": 4.158284664154053, + "learning_rate": 5.521367521367522e-06, + "loss": 0.3414, + "step": 1048 + }, + { + "epoch": 8.965811965811966, + "grad_norm": 4.875148296356201, + "learning_rate": 5.517094017094018e-06, + "loss": 0.2729, + "step": 1049 + }, + { + "epoch": 8.974358974358974, + "grad_norm": 5.2556352615356445, + "learning_rate": 5.512820512820514e-06, + "loss": 0.1422, + "step": 1050 + }, + { + "epoch": 8.982905982905983, + "grad_norm": 3.817049980163574, + "learning_rate": 5.50854700854701e-06, + "loss": 0.2514, + "step": 1051 + }, + { + "epoch": 8.991452991452991, + "grad_norm": 2.247227668762207, + "learning_rate": 5.504273504273505e-06, + "loss": 0.0703, + "step": 1052 + }, + { + "epoch": 9.0, + "grad_norm": 34.36362838745117, + "learning_rate": 5.500000000000001e-06, + "loss": 0.7433, + "step": 1053 + }, + { + "epoch": 9.0, + "eval_loss": 0.12675683200359344, + "eval_runtime": 9.3141, + "eval_samples_per_second": 50.032, + "eval_steps_per_second": 6.334, + "step": 1053 + }, + { + "epoch": 9.008547008547009, + "grad_norm": 5.314228057861328, + "learning_rate": 5.495726495726497e-06, + "loss": 0.2576, + "step": 1054 + }, + { + "epoch": 9.017094017094017, + "grad_norm": 34.33782958984375, + "learning_rate": 5.4914529914529925e-06, + "loss": 0.3833, + "step": 1055 + }, + { + "epoch": 9.025641025641026, + "grad_norm": 5.440598964691162, + "learning_rate": 5.487179487179488e-06, + "loss": 0.3898, + "step": 1056 + }, + { + "epoch": 9.034188034188034, + "grad_norm": 3.561518907546997, + "learning_rate": 5.482905982905984e-06, + "loss": 0.2197, + "step": 1057 + }, + { + "epoch": 9.042735042735043, + "grad_norm": 4.7679762840271, + "learning_rate": 5.478632478632479e-06, + "loss": 0.3885, + "step": 1058 + }, + { + "epoch": 9.051282051282051, + "grad_norm": 4.694134712219238, + "learning_rate": 5.474358974358975e-06, + "loss": 0.2532, + "step": 1059 + }, + { + "epoch": 9.05982905982906, + "grad_norm": 4.347025394439697, + "learning_rate": 5.470085470085471e-06, + "loss": 0.1949, + "step": 1060 + }, + { + "epoch": 9.068376068376068, + "grad_norm": 4.064525127410889, + "learning_rate": 5.465811965811966e-06, + "loss": 0.1597, + "step": 1061 + }, + { + "epoch": 9.076923076923077, + "grad_norm": 3.78560471534729, + "learning_rate": 5.461538461538461e-06, + "loss": 0.18, + "step": 1062 + }, + { + "epoch": 9.085470085470085, + "grad_norm": 7.843743324279785, + "learning_rate": 5.457264957264957e-06, + "loss": 0.3146, + "step": 1063 + }, + { + "epoch": 9.094017094017094, + "grad_norm": 8.152037620544434, + "learning_rate": 5.452991452991453e-06, + "loss": 0.3384, + "step": 1064 + }, + { + "epoch": 9.102564102564102, + "grad_norm": 3.987872838973999, + "learning_rate": 5.448717948717949e-06, + "loss": 0.2071, + "step": 1065 + }, + { + "epoch": 9.11111111111111, + "grad_norm": 3.478532552719116, + "learning_rate": 5.444444444444445e-06, + "loss": 0.1788, + "step": 1066 + }, + { + "epoch": 9.11965811965812, + "grad_norm": 3.6598286628723145, + "learning_rate": 5.44017094017094e-06, + "loss": 0.2459, + "step": 1067 + }, + { + "epoch": 9.128205128205128, + "grad_norm": 9.528829574584961, + "learning_rate": 5.435897435897436e-06, + "loss": 0.2046, + "step": 1068 + }, + { + "epoch": 9.136752136752136, + "grad_norm": 3.3274407386779785, + "learning_rate": 5.4316239316239316e-06, + "loss": 0.1414, + "step": 1069 + }, + { + "epoch": 9.145299145299145, + "grad_norm": 5.117324352264404, + "learning_rate": 5.4273504273504275e-06, + "loss": 0.3636, + "step": 1070 + }, + { + "epoch": 9.153846153846153, + "grad_norm": 8.604976654052734, + "learning_rate": 5.423076923076923e-06, + "loss": 0.2723, + "step": 1071 + }, + { + "epoch": 9.162393162393162, + "grad_norm": 72.67993927001953, + "learning_rate": 5.418803418803419e-06, + "loss": 0.5863, + "step": 1072 + }, + { + "epoch": 9.17094017094017, + "grad_norm": 3.8609094619750977, + "learning_rate": 5.414529914529914e-06, + "loss": 0.1778, + "step": 1073 + }, + { + "epoch": 9.179487179487179, + "grad_norm": 21.24209976196289, + "learning_rate": 5.41025641025641e-06, + "loss": 0.2062, + "step": 1074 + }, + { + "epoch": 9.188034188034187, + "grad_norm": 5.552285194396973, + "learning_rate": 5.405982905982906e-06, + "loss": 0.4685, + "step": 1075 + }, + { + "epoch": 9.196581196581196, + "grad_norm": 12.241254806518555, + "learning_rate": 5.401709401709402e-06, + "loss": 0.4309, + "step": 1076 + }, + { + "epoch": 9.205128205128204, + "grad_norm": 3.6276049613952637, + "learning_rate": 5.397435897435898e-06, + "loss": 0.0924, + "step": 1077 + }, + { + "epoch": 9.213675213675213, + "grad_norm": 10.98838996887207, + "learning_rate": 5.393162393162394e-06, + "loss": 0.7616, + "step": 1078 + }, + { + "epoch": 9.222222222222221, + "grad_norm": 4.689146041870117, + "learning_rate": 5.388888888888889e-06, + "loss": 0.346, + "step": 1079 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 6.385439872741699, + "learning_rate": 5.384615384615385e-06, + "loss": 0.2945, + "step": 1080 + }, + { + "epoch": 9.239316239316238, + "grad_norm": 2.4931023120880127, + "learning_rate": 5.3803418803418806e-06, + "loss": 0.172, + "step": 1081 + }, + { + "epoch": 9.247863247863247, + "grad_norm": 3.797539472579956, + "learning_rate": 5.3760683760683764e-06, + "loss": 0.0927, + "step": 1082 + }, + { + "epoch": 9.256410256410255, + "grad_norm": 2.7136716842651367, + "learning_rate": 5.371794871794872e-06, + "loss": 0.0932, + "step": 1083 + }, + { + "epoch": 9.264957264957266, + "grad_norm": 5.207858085632324, + "learning_rate": 5.367521367521367e-06, + "loss": 0.1176, + "step": 1084 + }, + { + "epoch": 9.273504273504274, + "grad_norm": 3.95009183883667, + "learning_rate": 5.363247863247863e-06, + "loss": 0.3045, + "step": 1085 + }, + { + "epoch": 9.282051282051283, + "grad_norm": 1.9097685813903809, + "learning_rate": 5.358974358974359e-06, + "loss": 0.1793, + "step": 1086 + }, + { + "epoch": 9.290598290598291, + "grad_norm": 3.205216407775879, + "learning_rate": 5.354700854700855e-06, + "loss": 0.1071, + "step": 1087 + }, + { + "epoch": 9.2991452991453, + "grad_norm": 3.481822967529297, + "learning_rate": 5.350427350427351e-06, + "loss": 0.3885, + "step": 1088 + }, + { + "epoch": 9.307692307692308, + "grad_norm": 11.802562713623047, + "learning_rate": 5.346153846153847e-06, + "loss": 0.1769, + "step": 1089 + }, + { + "epoch": 9.316239316239317, + "grad_norm": 3.101505994796753, + "learning_rate": 5.341880341880342e-06, + "loss": 0.1265, + "step": 1090 + }, + { + "epoch": 9.324786324786325, + "grad_norm": 5.163032054901123, + "learning_rate": 5.337606837606838e-06, + "loss": 0.4768, + "step": 1091 + }, + { + "epoch": 9.333333333333334, + "grad_norm": 1.8217605352401733, + "learning_rate": 5.333333333333334e-06, + "loss": 0.053, + "step": 1092 + }, + { + "epoch": 9.341880341880342, + "grad_norm": 2.6139562129974365, + "learning_rate": 5.3290598290598295e-06, + "loss": 0.0848, + "step": 1093 + }, + { + "epoch": 9.350427350427351, + "grad_norm": 3.1172311305999756, + "learning_rate": 5.3247863247863254e-06, + "loss": 0.1076, + "step": 1094 + }, + { + "epoch": 9.35897435897436, + "grad_norm": 5.907342433929443, + "learning_rate": 5.320512820512821e-06, + "loss": 0.1737, + "step": 1095 + }, + { + "epoch": 9.367521367521368, + "grad_norm": 45.74967575073242, + "learning_rate": 5.316239316239316e-06, + "loss": 0.2455, + "step": 1096 + }, + { + "epoch": 9.376068376068377, + "grad_norm": 3.1865549087524414, + "learning_rate": 5.311965811965812e-06, + "loss": 0.2236, + "step": 1097 + }, + { + "epoch": 9.384615384615385, + "grad_norm": 4.028379917144775, + "learning_rate": 5.307692307692308e-06, + "loss": 0.1065, + "step": 1098 + }, + { + "epoch": 9.393162393162394, + "grad_norm": 5.388605117797852, + "learning_rate": 5.303418803418804e-06, + "loss": 0.2967, + "step": 1099 + }, + { + "epoch": 9.401709401709402, + "grad_norm": 3.661736249923706, + "learning_rate": 5.2991452991453e-06, + "loss": 0.1271, + "step": 1100 + }, + { + "epoch": 9.41025641025641, + "grad_norm": 4.693649768829346, + "learning_rate": 5.294871794871795e-06, + "loss": 0.7891, + "step": 1101 + }, + { + "epoch": 9.418803418803419, + "grad_norm": 14.75247573852539, + "learning_rate": 5.290598290598291e-06, + "loss": 0.707, + "step": 1102 + }, + { + "epoch": 9.427350427350428, + "grad_norm": 5.123616695404053, + "learning_rate": 5.286324786324787e-06, + "loss": 0.2424, + "step": 1103 + }, + { + "epoch": 9.435897435897436, + "grad_norm": 5.946259021759033, + "learning_rate": 5.282051282051283e-06, + "loss": 0.2558, + "step": 1104 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 3.3757872581481934, + "learning_rate": 5.2777777777777785e-06, + "loss": 0.072, + "step": 1105 + }, + { + "epoch": 9.452991452991453, + "grad_norm": 4.639676094055176, + "learning_rate": 5.2735042735042744e-06, + "loss": 0.1483, + "step": 1106 + }, + { + "epoch": 9.461538461538462, + "grad_norm": 5.552156925201416, + "learning_rate": 5.2692307692307695e-06, + "loss": 0.341, + "step": 1107 + }, + { + "epoch": 9.47008547008547, + "grad_norm": 10.601661682128906, + "learning_rate": 5.264957264957265e-06, + "loss": 0.5964, + "step": 1108 + }, + { + "epoch": 9.478632478632479, + "grad_norm": 4.391530513763428, + "learning_rate": 5.260683760683761e-06, + "loss": 0.2346, + "step": 1109 + }, + { + "epoch": 9.487179487179487, + "grad_norm": 3.150240659713745, + "learning_rate": 5.256410256410257e-06, + "loss": 0.1, + "step": 1110 + }, + { + "epoch": 9.495726495726496, + "grad_norm": 5.60894775390625, + "learning_rate": 5.252136752136753e-06, + "loss": 0.397, + "step": 1111 + }, + { + "epoch": 9.504273504273504, + "grad_norm": 9.21768856048584, + "learning_rate": 5.247863247863249e-06, + "loss": 0.2292, + "step": 1112 + }, + { + "epoch": 9.512820512820513, + "grad_norm": 8.351348876953125, + "learning_rate": 5.243589743589744e-06, + "loss": 0.3129, + "step": 1113 + }, + { + "epoch": 9.521367521367521, + "grad_norm": 3.0813419818878174, + "learning_rate": 5.23931623931624e-06, + "loss": 0.2539, + "step": 1114 + }, + { + "epoch": 9.52991452991453, + "grad_norm": 5.553039073944092, + "learning_rate": 5.235042735042736e-06, + "loss": 0.1121, + "step": 1115 + }, + { + "epoch": 9.538461538461538, + "grad_norm": 3.973057746887207, + "learning_rate": 5.230769230769232e-06, + "loss": 0.4928, + "step": 1116 + }, + { + "epoch": 9.547008547008547, + "grad_norm": 4.753414630889893, + "learning_rate": 5.2264957264957275e-06, + "loss": 0.2247, + "step": 1117 + }, + { + "epoch": 9.555555555555555, + "grad_norm": 7.344094753265381, + "learning_rate": 5.2222222222222226e-06, + "loss": 0.1405, + "step": 1118 + }, + { + "epoch": 9.564102564102564, + "grad_norm": 47.83219528198242, + "learning_rate": 5.2179487179487185e-06, + "loss": 0.3108, + "step": 1119 + }, + { + "epoch": 9.572649572649572, + "grad_norm": 2.31591796875, + "learning_rate": 5.213675213675214e-06, + "loss": 0.1019, + "step": 1120 + }, + { + "epoch": 9.581196581196581, + "grad_norm": 3.871413230895996, + "learning_rate": 5.20940170940171e-06, + "loss": 0.2562, + "step": 1121 + }, + { + "epoch": 9.58974358974359, + "grad_norm": 2.1789255142211914, + "learning_rate": 5.205128205128206e-06, + "loss": 0.0571, + "step": 1122 + }, + { + "epoch": 9.598290598290598, + "grad_norm": 4.119174957275391, + "learning_rate": 5.200854700854702e-06, + "loss": 0.2799, + "step": 1123 + }, + { + "epoch": 9.606837606837606, + "grad_norm": 7.873704433441162, + "learning_rate": 5.196581196581197e-06, + "loss": 0.2154, + "step": 1124 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 3.386780023574829, + "learning_rate": 5.192307692307693e-06, + "loss": 0.1607, + "step": 1125 + }, + { + "epoch": 9.623931623931623, + "grad_norm": 3.3607964515686035, + "learning_rate": 5.188034188034189e-06, + "loss": 0.22, + "step": 1126 + }, + { + "epoch": 9.632478632478632, + "grad_norm": 10.655082702636719, + "learning_rate": 5.183760683760685e-06, + "loss": 0.2102, + "step": 1127 + }, + { + "epoch": 9.64102564102564, + "grad_norm": 5.550488471984863, + "learning_rate": 5.179487179487181e-06, + "loss": 0.347, + "step": 1128 + }, + { + "epoch": 9.649572649572649, + "grad_norm": 4.184569835662842, + "learning_rate": 5.1752136752136765e-06, + "loss": 0.183, + "step": 1129 + }, + { + "epoch": 9.658119658119658, + "grad_norm": 4.892969131469727, + "learning_rate": 5.1709401709401716e-06, + "loss": 0.2896, + "step": 1130 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 5.926670074462891, + "learning_rate": 5.1666666666666675e-06, + "loss": 0.3321, + "step": 1131 + }, + { + "epoch": 9.675213675213675, + "grad_norm": 11.719461441040039, + "learning_rate": 5.162393162393163e-06, + "loss": 0.4055, + "step": 1132 + }, + { + "epoch": 9.683760683760683, + "grad_norm": 3.5666840076446533, + "learning_rate": 5.158119658119659e-06, + "loss": 0.2318, + "step": 1133 + }, + { + "epoch": 9.692307692307692, + "grad_norm": 6.800848484039307, + "learning_rate": 5.1538461538461534e-06, + "loss": 0.1202, + "step": 1134 + }, + { + "epoch": 9.7008547008547, + "grad_norm": 4.50139856338501, + "learning_rate": 5.149572649572649e-06, + "loss": 0.1914, + "step": 1135 + }, + { + "epoch": 9.709401709401709, + "grad_norm": 2.599607467651367, + "learning_rate": 5.145299145299145e-06, + "loss": 0.0833, + "step": 1136 + }, + { + "epoch": 9.717948717948717, + "grad_norm": 6.084483623504639, + "learning_rate": 5.141025641025641e-06, + "loss": 0.0907, + "step": 1137 + }, + { + "epoch": 9.726495726495726, + "grad_norm": 4.542915344238281, + "learning_rate": 5.136752136752137e-06, + "loss": 0.4554, + "step": 1138 + }, + { + "epoch": 9.735042735042736, + "grad_norm": 3.871166229248047, + "learning_rate": 5.132478632478632e-06, + "loss": 0.3037, + "step": 1139 + }, + { + "epoch": 9.743589743589745, + "grad_norm": 5.121057033538818, + "learning_rate": 5.128205128205128e-06, + "loss": 0.1751, + "step": 1140 + }, + { + "epoch": 9.752136752136753, + "grad_norm": 3.7517125606536865, + "learning_rate": 5.123931623931624e-06, + "loss": 0.3144, + "step": 1141 + }, + { + "epoch": 9.760683760683762, + "grad_norm": 1.7604278326034546, + "learning_rate": 5.11965811965812e-06, + "loss": 0.0649, + "step": 1142 + }, + { + "epoch": 9.76923076923077, + "grad_norm": 13.68947982788086, + "learning_rate": 5.115384615384616e-06, + "loss": 0.2184, + "step": 1143 + }, + { + "epoch": 9.777777777777779, + "grad_norm": 5.716836452484131, + "learning_rate": 5.1111111111111115e-06, + "loss": 0.1876, + "step": 1144 + }, + { + "epoch": 9.786324786324787, + "grad_norm": 8.21943187713623, + "learning_rate": 5.1068376068376065e-06, + "loss": 0.349, + "step": 1145 + }, + { + "epoch": 9.794871794871796, + "grad_norm": 5.270402908325195, + "learning_rate": 5.1025641025641024e-06, + "loss": 0.4442, + "step": 1146 + }, + { + "epoch": 9.803418803418804, + "grad_norm": 2.3825948238372803, + "learning_rate": 5.098290598290598e-06, + "loss": 0.2237, + "step": 1147 + }, + { + "epoch": 9.811965811965813, + "grad_norm": 11.812047958374023, + "learning_rate": 5.094017094017094e-06, + "loss": 0.5122, + "step": 1148 + }, + { + "epoch": 9.820512820512821, + "grad_norm": 9.14202880859375, + "learning_rate": 5.08974358974359e-06, + "loss": 0.3407, + "step": 1149 + }, + { + "epoch": 9.82905982905983, + "grad_norm": 5.273305892944336, + "learning_rate": 5.085470085470086e-06, + "loss": 0.1702, + "step": 1150 + }, + { + "epoch": 9.837606837606838, + "grad_norm": 2.995126485824585, + "learning_rate": 5.081196581196581e-06, + "loss": 0.228, + "step": 1151 + }, + { + "epoch": 9.846153846153847, + "grad_norm": 4.077675819396973, + "learning_rate": 5.076923076923077e-06, + "loss": 0.4022, + "step": 1152 + }, + { + "epoch": 9.854700854700855, + "grad_norm": 2.1732425689697266, + "learning_rate": 5.072649572649573e-06, + "loss": 0.1178, + "step": 1153 + }, + { + "epoch": 9.863247863247864, + "grad_norm": 2.905172109603882, + "learning_rate": 5.068376068376069e-06, + "loss": 0.1718, + "step": 1154 + }, + { + "epoch": 9.871794871794872, + "grad_norm": 2.702521324157715, + "learning_rate": 5.064102564102565e-06, + "loss": 0.1488, + "step": 1155 + }, + { + "epoch": 9.88034188034188, + "grad_norm": 2.414088487625122, + "learning_rate": 5.05982905982906e-06, + "loss": 0.1034, + "step": 1156 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 2.618173360824585, + "learning_rate": 5.0555555555555555e-06, + "loss": 0.0783, + "step": 1157 + }, + { + "epoch": 9.897435897435898, + "grad_norm": 5.002628803253174, + "learning_rate": 5.051282051282051e-06, + "loss": 0.1195, + "step": 1158 + }, + { + "epoch": 9.905982905982906, + "grad_norm": 2.84708833694458, + "learning_rate": 5.047008547008547e-06, + "loss": 0.0906, + "step": 1159 + }, + { + "epoch": 9.914529914529915, + "grad_norm": 5.564020156860352, + "learning_rate": 5.042735042735043e-06, + "loss": 0.2037, + "step": 1160 + }, + { + "epoch": 9.923076923076923, + "grad_norm": 3.7763166427612305, + "learning_rate": 5.038461538461539e-06, + "loss": 0.2067, + "step": 1161 + }, + { + "epoch": 9.931623931623932, + "grad_norm": 2.67268705368042, + "learning_rate": 5.034188034188034e-06, + "loss": 0.0557, + "step": 1162 + }, + { + "epoch": 9.94017094017094, + "grad_norm": 2.4144680500030518, + "learning_rate": 5.02991452991453e-06, + "loss": 0.194, + "step": 1163 + }, + { + "epoch": 9.948717948717949, + "grad_norm": 2.0716731548309326, + "learning_rate": 5.025641025641026e-06, + "loss": 0.1253, + "step": 1164 + }, + { + "epoch": 9.957264957264957, + "grad_norm": 13.20478630065918, + "learning_rate": 5.021367521367522e-06, + "loss": 0.268, + "step": 1165 + }, + { + "epoch": 9.965811965811966, + "grad_norm": 2.093698263168335, + "learning_rate": 5.017094017094018e-06, + "loss": 0.0738, + "step": 1166 + }, + { + "epoch": 9.974358974358974, + "grad_norm": 2.2758119106292725, + "learning_rate": 5.012820512820514e-06, + "loss": 0.0804, + "step": 1167 + }, + { + "epoch": 9.982905982905983, + "grad_norm": 21.843395233154297, + "learning_rate": 5.008547008547009e-06, + "loss": 0.3298, + "step": 1168 + }, + { + "epoch": 9.991452991452991, + "grad_norm": 3.0435073375701904, + "learning_rate": 5.0042735042735045e-06, + "loss": 0.1318, + "step": 1169 + }, + { + "epoch": 10.0, + "grad_norm": 8.449163436889648, + "learning_rate": 5e-06, + "loss": 0.1725, + "step": 1170 + }, + { + "epoch": 10.0, + "eval_loss": 0.10285739600658417, + "eval_runtime": 9.2384, + "eval_samples_per_second": 50.441, + "eval_steps_per_second": 6.386, + "step": 1170 + }, + { + "epoch": 10.008547008547009, + "grad_norm": 4.151456356048584, + "learning_rate": 4.995726495726496e-06, + "loss": 0.3336, + "step": 1171 + }, + { + "epoch": 10.017094017094017, + "grad_norm": 2.38647723197937, + "learning_rate": 4.991452991452992e-06, + "loss": 0.1138, + "step": 1172 + }, + { + "epoch": 10.025641025641026, + "grad_norm": 4.44817590713501, + "learning_rate": 4.987179487179487e-06, + "loss": 0.0954, + "step": 1173 + }, + { + "epoch": 10.034188034188034, + "grad_norm": 2.6213347911834717, + "learning_rate": 4.982905982905983e-06, + "loss": 0.0695, + "step": 1174 + }, + { + "epoch": 10.042735042735043, + "grad_norm": 4.664891719818115, + "learning_rate": 4.978632478632479e-06, + "loss": 0.1067, + "step": 1175 + }, + { + "epoch": 10.051282051282051, + "grad_norm": 1.7059048414230347, + "learning_rate": 4.974358974358975e-06, + "loss": 0.0321, + "step": 1176 + }, + { + "epoch": 10.05982905982906, + "grad_norm": 5.123709678649902, + "learning_rate": 4.970085470085471e-06, + "loss": 0.2117, + "step": 1177 + }, + { + "epoch": 10.068376068376068, + "grad_norm": 2.2717695236206055, + "learning_rate": 4.965811965811967e-06, + "loss": 0.2187, + "step": 1178 + }, + { + "epoch": 10.076923076923077, + "grad_norm": 4.669886112213135, + "learning_rate": 4.961538461538462e-06, + "loss": 0.4615, + "step": 1179 + }, + { + "epoch": 10.085470085470085, + "grad_norm": 18.739727020263672, + "learning_rate": 4.957264957264958e-06, + "loss": 0.3431, + "step": 1180 + }, + { + "epoch": 10.094017094017094, + "grad_norm": 7.798559188842773, + "learning_rate": 4.9529914529914535e-06, + "loss": 0.2483, + "step": 1181 + }, + { + "epoch": 10.102564102564102, + "grad_norm": 22.59453773498535, + "learning_rate": 4.948717948717949e-06, + "loss": 0.15, + "step": 1182 + }, + { + "epoch": 10.11111111111111, + "grad_norm": 2.5734364986419678, + "learning_rate": 4.944444444444445e-06, + "loss": 0.0465, + "step": 1183 + }, + { + "epoch": 10.11965811965812, + "grad_norm": 3.1944875717163086, + "learning_rate": 4.940170940170941e-06, + "loss": 0.1429, + "step": 1184 + }, + { + "epoch": 10.128205128205128, + "grad_norm": 1.6943906545639038, + "learning_rate": 4.935897435897436e-06, + "loss": 0.0685, + "step": 1185 + }, + { + "epoch": 10.136752136752136, + "grad_norm": 4.497282981872559, + "learning_rate": 4.931623931623932e-06, + "loss": 0.2113, + "step": 1186 + }, + { + "epoch": 10.145299145299145, + "grad_norm": 2.9377167224884033, + "learning_rate": 4.927350427350428e-06, + "loss": 0.1352, + "step": 1187 + }, + { + "epoch": 10.153846153846153, + "grad_norm": 8.528215408325195, + "learning_rate": 4.923076923076924e-06, + "loss": 0.3268, + "step": 1188 + }, + { + "epoch": 10.162393162393162, + "grad_norm": 2.143850803375244, + "learning_rate": 4.918803418803419e-06, + "loss": 0.0923, + "step": 1189 + }, + { + "epoch": 10.17094017094017, + "grad_norm": 3.921250343322754, + "learning_rate": 4.914529914529915e-06, + "loss": 0.1451, + "step": 1190 + }, + { + "epoch": 10.179487179487179, + "grad_norm": 10.713285446166992, + "learning_rate": 4.910256410256411e-06, + "loss": 0.17, + "step": 1191 + }, + { + "epoch": 10.188034188034187, + "grad_norm": 2.450204849243164, + "learning_rate": 4.905982905982906e-06, + "loss": 0.0765, + "step": 1192 + }, + { + "epoch": 10.196581196581196, + "grad_norm": 4.750647068023682, + "learning_rate": 4.901709401709402e-06, + "loss": 0.2829, + "step": 1193 + }, + { + "epoch": 10.205128205128204, + "grad_norm": 12.714463233947754, + "learning_rate": 4.8974358974358975e-06, + "loss": 0.6767, + "step": 1194 + }, + { + "epoch": 10.213675213675213, + "grad_norm": 6.759951591491699, + "learning_rate": 4.8931623931623934e-06, + "loss": 0.2369, + "step": 1195 + }, + { + "epoch": 10.222222222222221, + "grad_norm": 8.592784881591797, + "learning_rate": 4.888888888888889e-06, + "loss": 0.4203, + "step": 1196 + }, + { + "epoch": 10.23076923076923, + "grad_norm": 5.04047155380249, + "learning_rate": 4.884615384615385e-06, + "loss": 0.1023, + "step": 1197 + }, + { + "epoch": 10.239316239316238, + "grad_norm": 38.112152099609375, + "learning_rate": 4.88034188034188e-06, + "loss": 0.4686, + "step": 1198 + }, + { + "epoch": 10.247863247863247, + "grad_norm": 6.751104354858398, + "learning_rate": 4.876068376068376e-06, + "loss": 0.085, + "step": 1199 + }, + { + "epoch": 10.256410256410255, + "grad_norm": 4.3117594718933105, + "learning_rate": 4.871794871794872e-06, + "loss": 0.1504, + "step": 1200 + }, + { + "epoch": 10.264957264957266, + "grad_norm": 2.251265287399292, + "learning_rate": 4.867521367521368e-06, + "loss": 0.1664, + "step": 1201 + }, + { + "epoch": 10.273504273504274, + "grad_norm": 2.1650373935699463, + "learning_rate": 4.863247863247864e-06, + "loss": 0.0959, + "step": 1202 + }, + { + "epoch": 10.282051282051283, + "grad_norm": 2.5863089561462402, + "learning_rate": 4.85897435897436e-06, + "loss": 0.1148, + "step": 1203 + }, + { + "epoch": 10.290598290598291, + "grad_norm": 1.974357008934021, + "learning_rate": 4.854700854700855e-06, + "loss": 0.0663, + "step": 1204 + }, + { + "epoch": 10.2991452991453, + "grad_norm": 2.3226940631866455, + "learning_rate": 4.850427350427351e-06, + "loss": 0.1363, + "step": 1205 + }, + { + "epoch": 10.307692307692308, + "grad_norm": 4.034085750579834, + "learning_rate": 4.8461538461538465e-06, + "loss": 0.3473, + "step": 1206 + }, + { + "epoch": 10.316239316239317, + "grad_norm": 2.492307186126709, + "learning_rate": 4.8418803418803424e-06, + "loss": 0.1742, + "step": 1207 + }, + { + "epoch": 10.324786324786325, + "grad_norm": 2.886432409286499, + "learning_rate": 4.837606837606838e-06, + "loss": 0.1382, + "step": 1208 + }, + { + "epoch": 10.333333333333334, + "grad_norm": 3.6314749717712402, + "learning_rate": 4.833333333333333e-06, + "loss": 0.1556, + "step": 1209 + }, + { + "epoch": 10.341880341880342, + "grad_norm": 2.2757928371429443, + "learning_rate": 4.829059829059829e-06, + "loss": 0.0434, + "step": 1210 + }, + { + "epoch": 10.350427350427351, + "grad_norm": 3.4152615070343018, + "learning_rate": 4.824786324786325e-06, + "loss": 0.2903, + "step": 1211 + }, + { + "epoch": 10.35897435897436, + "grad_norm": 3.873960256576538, + "learning_rate": 4.820512820512821e-06, + "loss": 0.2611, + "step": 1212 + }, + { + "epoch": 10.367521367521368, + "grad_norm": 4.2241291999816895, + "learning_rate": 4.816239316239317e-06, + "loss": 0.0954, + "step": 1213 + }, + { + "epoch": 10.376068376068377, + "grad_norm": 5.454725742340088, + "learning_rate": 4.811965811965813e-06, + "loss": 0.1361, + "step": 1214 + }, + { + "epoch": 10.384615384615385, + "grad_norm": 3.482558012008667, + "learning_rate": 4.807692307692308e-06, + "loss": 0.0861, + "step": 1215 + }, + { + "epoch": 10.393162393162394, + "grad_norm": 2.301254987716675, + "learning_rate": 4.803418803418804e-06, + "loss": 0.1571, + "step": 1216 + }, + { + "epoch": 10.401709401709402, + "grad_norm": 6.0665602684021, + "learning_rate": 4.7991452991453e-06, + "loss": 0.5323, + "step": 1217 + }, + { + "epoch": 10.41025641025641, + "grad_norm": 3.6052770614624023, + "learning_rate": 4.7948717948717955e-06, + "loss": 0.3789, + "step": 1218 + }, + { + "epoch": 10.418803418803419, + "grad_norm": 3.9434757232666016, + "learning_rate": 4.790598290598291e-06, + "loss": 0.0605, + "step": 1219 + }, + { + "epoch": 10.427350427350428, + "grad_norm": 5.260069847106934, + "learning_rate": 4.786324786324787e-06, + "loss": 0.3163, + "step": 1220 + }, + { + "epoch": 10.435897435897436, + "grad_norm": 5.219394207000732, + "learning_rate": 4.782051282051282e-06, + "loss": 0.4339, + "step": 1221 + }, + { + "epoch": 10.444444444444445, + "grad_norm": 2.7057230472564697, + "learning_rate": 4.777777777777778e-06, + "loss": 0.0787, + "step": 1222 + }, + { + "epoch": 10.452991452991453, + "grad_norm": 11.005247116088867, + "learning_rate": 4.773504273504274e-06, + "loss": 0.255, + "step": 1223 + }, + { + "epoch": 10.461538461538462, + "grad_norm": 1.7238801717758179, + "learning_rate": 4.76923076923077e-06, + "loss": 0.0605, + "step": 1224 + }, + { + "epoch": 10.47008547008547, + "grad_norm": 6.509312629699707, + "learning_rate": 4.764957264957265e-06, + "loss": 0.2899, + "step": 1225 + }, + { + "epoch": 10.478632478632479, + "grad_norm": 7.1476359367370605, + "learning_rate": 4.760683760683761e-06, + "loss": 0.336, + "step": 1226 + }, + { + "epoch": 10.487179487179487, + "grad_norm": 15.92902660369873, + "learning_rate": 4.756410256410257e-06, + "loss": 0.4864, + "step": 1227 + }, + { + "epoch": 10.495726495726496, + "grad_norm": 5.545684337615967, + "learning_rate": 4.752136752136752e-06, + "loss": 0.4741, + "step": 1228 + }, + { + "epoch": 10.504273504273504, + "grad_norm": 3.2521066665649414, + "learning_rate": 4.747863247863248e-06, + "loss": 0.0894, + "step": 1229 + }, + { + "epoch": 10.512820512820513, + "grad_norm": 2.696866512298584, + "learning_rate": 4.743589743589744e-06, + "loss": 0.111, + "step": 1230 + }, + { + "epoch": 10.521367521367521, + "grad_norm": 1.8362340927124023, + "learning_rate": 4.7393162393162396e-06, + "loss": 0.0579, + "step": 1231 + }, + { + "epoch": 10.52991452991453, + "grad_norm": 2.96872878074646, + "learning_rate": 4.7350427350427355e-06, + "loss": 0.0781, + "step": 1232 + }, + { + "epoch": 10.538461538461538, + "grad_norm": 1.5503445863723755, + "learning_rate": 4.730769230769231e-06, + "loss": 0.0451, + "step": 1233 + }, + { + "epoch": 10.547008547008547, + "grad_norm": 3.9600377082824707, + "learning_rate": 4.726495726495726e-06, + "loss": 0.1721, + "step": 1234 + }, + { + "epoch": 10.555555555555555, + "grad_norm": 3.3868823051452637, + "learning_rate": 4.722222222222222e-06, + "loss": 0.1803, + "step": 1235 + }, + { + "epoch": 10.564102564102564, + "grad_norm": 2.528111219406128, + "learning_rate": 4.717948717948718e-06, + "loss": 0.238, + "step": 1236 + }, + { + "epoch": 10.572649572649572, + "grad_norm": 6.960350036621094, + "learning_rate": 4.713675213675214e-06, + "loss": 0.4353, + "step": 1237 + }, + { + "epoch": 10.581196581196581, + "grad_norm": 2.3169686794281006, + "learning_rate": 4.70940170940171e-06, + "loss": 0.1891, + "step": 1238 + }, + { + "epoch": 10.58974358974359, + "grad_norm": 2.021212577819824, + "learning_rate": 4.705128205128206e-06, + "loss": 0.0865, + "step": 1239 + }, + { + "epoch": 10.598290598290598, + "grad_norm": 2.445462942123413, + "learning_rate": 4.700854700854701e-06, + "loss": 0.0973, + "step": 1240 + }, + { + "epoch": 10.606837606837606, + "grad_norm": 3.4490067958831787, + "learning_rate": 4.696581196581197e-06, + "loss": 0.1419, + "step": 1241 + }, + { + "epoch": 10.615384615384615, + "grad_norm": 3.2859914302825928, + "learning_rate": 4.692307692307693e-06, + "loss": 0.1587, + "step": 1242 + }, + { + "epoch": 10.623931623931623, + "grad_norm": 4.754831790924072, + "learning_rate": 4.6880341880341886e-06, + "loss": 0.2537, + "step": 1243 + }, + { + "epoch": 10.632478632478632, + "grad_norm": 3.220867156982422, + "learning_rate": 4.6837606837606844e-06, + "loss": 0.0941, + "step": 1244 + }, + { + "epoch": 10.64102564102564, + "grad_norm": 5.699328422546387, + "learning_rate": 4.6794871794871795e-06, + "loss": 0.255, + "step": 1245 + }, + { + "epoch": 10.649572649572649, + "grad_norm": 1.5174522399902344, + "learning_rate": 4.675213675213675e-06, + "loss": 0.048, + "step": 1246 + }, + { + "epoch": 10.658119658119658, + "grad_norm": 2.4277050495147705, + "learning_rate": 4.670940170940171e-06, + "loss": 0.1127, + "step": 1247 + }, + { + "epoch": 10.666666666666666, + "grad_norm": 2.079031229019165, + "learning_rate": 4.666666666666667e-06, + "loss": 0.1038, + "step": 1248 + }, + { + "epoch": 10.675213675213675, + "grad_norm": 953.4605102539062, + "learning_rate": 4.662393162393163e-06, + "loss": 1.1892, + "step": 1249 + }, + { + "epoch": 10.683760683760683, + "grad_norm": 9.190105438232422, + "learning_rate": 4.658119658119659e-06, + "loss": 0.3541, + "step": 1250 + }, + { + "epoch": 10.692307692307692, + "grad_norm": 2.3222947120666504, + "learning_rate": 4.653846153846154e-06, + "loss": 0.0842, + "step": 1251 + }, + { + "epoch": 10.7008547008547, + "grad_norm": 2.2312700748443604, + "learning_rate": 4.64957264957265e-06, + "loss": 0.088, + "step": 1252 + }, + { + "epoch": 10.709401709401709, + "grad_norm": 3.987630844116211, + "learning_rate": 4.645299145299146e-06, + "loss": 0.1667, + "step": 1253 + }, + { + "epoch": 10.717948717948717, + "grad_norm": 5.108981609344482, + "learning_rate": 4.641025641025642e-06, + "loss": 0.4291, + "step": 1254 + }, + { + "epoch": 10.726495726495726, + "grad_norm": 2.8597464561462402, + "learning_rate": 4.6367521367521375e-06, + "loss": 0.0564, + "step": 1255 + }, + { + "epoch": 10.735042735042736, + "grad_norm": 2.3642940521240234, + "learning_rate": 4.6324786324786334e-06, + "loss": 0.0909, + "step": 1256 + }, + { + "epoch": 10.743589743589745, + "grad_norm": 1.5703462362289429, + "learning_rate": 4.6282051282051285e-06, + "loss": 0.0395, + "step": 1257 + }, + { + "epoch": 10.752136752136753, + "grad_norm": 2.952786922454834, + "learning_rate": 4.623931623931624e-06, + "loss": 0.1824, + "step": 1258 + }, + { + "epoch": 10.760683760683762, + "grad_norm": 2.9027185440063477, + "learning_rate": 4.61965811965812e-06, + "loss": 0.0765, + "step": 1259 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 2.4386038780212402, + "learning_rate": 4.615384615384616e-06, + "loss": 0.2761, + "step": 1260 + }, + { + "epoch": 10.777777777777779, + "grad_norm": 7.146468639373779, + "learning_rate": 4.611111111111112e-06, + "loss": 0.4427, + "step": 1261 + }, + { + "epoch": 10.786324786324787, + "grad_norm": 2.002096652984619, + "learning_rate": 4.606837606837607e-06, + "loss": 0.0879, + "step": 1262 + }, + { + "epoch": 10.794871794871796, + "grad_norm": 6.504697322845459, + "learning_rate": 4.602564102564103e-06, + "loss": 0.1805, + "step": 1263 + }, + { + "epoch": 10.803418803418804, + "grad_norm": 9.748340606689453, + "learning_rate": 4.598290598290598e-06, + "loss": 0.5813, + "step": 1264 + }, + { + "epoch": 10.811965811965813, + "grad_norm": 3.67153000831604, + "learning_rate": 4.594017094017094e-06, + "loss": 0.4175, + "step": 1265 + }, + { + "epoch": 10.820512820512821, + "grad_norm": 9.109044075012207, + "learning_rate": 4.58974358974359e-06, + "loss": 0.4505, + "step": 1266 + }, + { + "epoch": 10.82905982905983, + "grad_norm": 5.419683933258057, + "learning_rate": 4.585470085470086e-06, + "loss": 0.2316, + "step": 1267 + }, + { + "epoch": 10.837606837606838, + "grad_norm": 2.901182174682617, + "learning_rate": 4.581196581196582e-06, + "loss": 0.0583, + "step": 1268 + }, + { + "epoch": 10.846153846153847, + "grad_norm": 4.579897403717041, + "learning_rate": 4.5769230769230775e-06, + "loss": 0.0536, + "step": 1269 + }, + { + "epoch": 10.854700854700855, + "grad_norm": 4.232446670532227, + "learning_rate": 4.5726495726495725e-06, + "loss": 0.17, + "step": 1270 + }, + { + "epoch": 10.863247863247864, + "grad_norm": 8.059329986572266, + "learning_rate": 4.568376068376068e-06, + "loss": 0.256, + "step": 1271 + }, + { + "epoch": 10.871794871794872, + "grad_norm": 1.5736984014511108, + "learning_rate": 4.564102564102564e-06, + "loss": 0.058, + "step": 1272 + }, + { + "epoch": 10.88034188034188, + "grad_norm": 5.397885799407959, + "learning_rate": 4.55982905982906e-06, + "loss": 0.1299, + "step": 1273 + }, + { + "epoch": 10.88888888888889, + "grad_norm": 3.9831533432006836, + "learning_rate": 4.555555555555556e-06, + "loss": 0.1762, + "step": 1274 + }, + { + "epoch": 10.897435897435898, + "grad_norm": 2.170370101928711, + "learning_rate": 4.551282051282052e-06, + "loss": 0.1355, + "step": 1275 + }, + { + "epoch": 10.905982905982906, + "grad_norm": 5.151463508605957, + "learning_rate": 4.547008547008547e-06, + "loss": 0.3151, + "step": 1276 + }, + { + "epoch": 10.914529914529915, + "grad_norm": 2.215559482574463, + "learning_rate": 4.542735042735043e-06, + "loss": 0.1054, + "step": 1277 + }, + { + "epoch": 10.923076923076923, + "grad_norm": 3.62188458442688, + "learning_rate": 4.538461538461539e-06, + "loss": 0.3839, + "step": 1278 + }, + { + "epoch": 10.931623931623932, + "grad_norm": 1.8855514526367188, + "learning_rate": 4.534188034188035e-06, + "loss": 0.0639, + "step": 1279 + }, + { + "epoch": 10.94017094017094, + "grad_norm": 3.0260651111602783, + "learning_rate": 4.5299145299145306e-06, + "loss": 0.1216, + "step": 1280 + }, + { + "epoch": 10.948717948717949, + "grad_norm": 13.30820083618164, + "learning_rate": 4.525641025641026e-06, + "loss": 0.3337, + "step": 1281 + }, + { + "epoch": 10.957264957264957, + "grad_norm": 4.356720447540283, + "learning_rate": 4.5213675213675215e-06, + "loss": 0.2692, + "step": 1282 + }, + { + "epoch": 10.965811965811966, + "grad_norm": 2.077742576599121, + "learning_rate": 4.517094017094017e-06, + "loss": 0.1181, + "step": 1283 + }, + { + "epoch": 10.974358974358974, + "grad_norm": 6.6224284172058105, + "learning_rate": 4.512820512820513e-06, + "loss": 0.1526, + "step": 1284 + }, + { + "epoch": 10.982905982905983, + "grad_norm": 4.072678565979004, + "learning_rate": 4.508547008547009e-06, + "loss": 0.1804, + "step": 1285 + }, + { + "epoch": 10.991452991452991, + "grad_norm": 3.430922269821167, + "learning_rate": 4.504273504273505e-06, + "loss": 0.1316, + "step": 1286 + }, + { + "epoch": 11.0, + "grad_norm": 1.6371959447860718, + "learning_rate": 4.5e-06, + "loss": 0.0596, + "step": 1287 + }, + { + "epoch": 11.0, + "eval_loss": 0.08654214441776276, + "eval_runtime": 9.3013, + "eval_samples_per_second": 50.1, + "eval_steps_per_second": 6.343, + "step": 1287 + }, + { + "epoch": 11.008547008547009, + "grad_norm": 5.072701454162598, + "learning_rate": 4.495726495726496e-06, + "loss": 0.2195, + "step": 1288 + }, + { + "epoch": 11.017094017094017, + "grad_norm": 6.791895389556885, + "learning_rate": 4.491452991452992e-06, + "loss": 0.5354, + "step": 1289 + }, + { + "epoch": 11.025641025641026, + "grad_norm": 12.475218772888184, + "learning_rate": 4.487179487179488e-06, + "loss": 0.1828, + "step": 1290 + }, + { + "epoch": 11.034188034188034, + "grad_norm": 5.892624855041504, + "learning_rate": 4.482905982905984e-06, + "loss": 0.1617, + "step": 1291 + }, + { + "epoch": 11.042735042735043, + "grad_norm": 1.742074728012085, + "learning_rate": 4.4786324786324796e-06, + "loss": 0.0508, + "step": 1292 + }, + { + "epoch": 11.051282051282051, + "grad_norm": 2.389373302459717, + "learning_rate": 4.474358974358975e-06, + "loss": 0.1009, + "step": 1293 + }, + { + "epoch": 11.05982905982906, + "grad_norm": 3.7152106761932373, + "learning_rate": 4.4700854700854705e-06, + "loss": 0.2157, + "step": 1294 + }, + { + "epoch": 11.068376068376068, + "grad_norm": 7.217955112457275, + "learning_rate": 4.465811965811966e-06, + "loss": 0.2737, + "step": 1295 + }, + { + "epoch": 11.076923076923077, + "grad_norm": 2.0971977710723877, + "learning_rate": 4.461538461538462e-06, + "loss": 0.1273, + "step": 1296 + }, + { + "epoch": 11.085470085470085, + "grad_norm": 1.1616859436035156, + "learning_rate": 4.457264957264958e-06, + "loss": 0.0325, + "step": 1297 + }, + { + "epoch": 11.094017094017094, + "grad_norm": 3.4287424087524414, + "learning_rate": 4.452991452991453e-06, + "loss": 0.1136, + "step": 1298 + }, + { + "epoch": 11.102564102564102, + "grad_norm": 1.6207005977630615, + "learning_rate": 4.448717948717949e-06, + "loss": 0.0344, + "step": 1299 + }, + { + "epoch": 11.11111111111111, + "grad_norm": 3.009976863861084, + "learning_rate": 4.444444444444444e-06, + "loss": 0.1532, + "step": 1300 + }, + { + "epoch": 11.11965811965812, + "grad_norm": 2.9768505096435547, + "learning_rate": 4.44017094017094e-06, + "loss": 0.0874, + "step": 1301 + }, + { + "epoch": 11.128205128205128, + "grad_norm": 3.622715473175049, + "learning_rate": 4.435897435897436e-06, + "loss": 0.3132, + "step": 1302 + }, + { + "epoch": 11.136752136752136, + "grad_norm": 3.5741326808929443, + "learning_rate": 4.431623931623932e-06, + "loss": 0.0914, + "step": 1303 + }, + { + "epoch": 11.145299145299145, + "grad_norm": 7.436197280883789, + "learning_rate": 4.427350427350428e-06, + "loss": 0.329, + "step": 1304 + }, + { + "epoch": 11.153846153846153, + "grad_norm": 2.390066146850586, + "learning_rate": 4.423076923076924e-06, + "loss": 0.0867, + "step": 1305 + }, + { + "epoch": 11.162393162393162, + "grad_norm": 1.928227424621582, + "learning_rate": 4.418803418803419e-06, + "loss": 0.0294, + "step": 1306 + }, + { + "epoch": 11.17094017094017, + "grad_norm": 4.40464448928833, + "learning_rate": 4.4145299145299145e-06, + "loss": 0.3704, + "step": 1307 + }, + { + "epoch": 11.179487179487179, + "grad_norm": 22.183835983276367, + "learning_rate": 4.4102564102564104e-06, + "loss": 0.6011, + "step": 1308 + }, + { + "epoch": 11.188034188034187, + "grad_norm": 2.496633768081665, + "learning_rate": 4.405982905982906e-06, + "loss": 0.0494, + "step": 1309 + }, + { + "epoch": 11.196581196581196, + "grad_norm": 1.142687201499939, + "learning_rate": 4.401709401709402e-06, + "loss": 0.0292, + "step": 1310 + }, + { + "epoch": 11.205128205128204, + "grad_norm": 2.0762455463409424, + "learning_rate": 4.397435897435898e-06, + "loss": 0.1123, + "step": 1311 + }, + { + "epoch": 11.213675213675213, + "grad_norm": 1.5389565229415894, + "learning_rate": 4.393162393162393e-06, + "loss": 0.0316, + "step": 1312 + }, + { + "epoch": 11.222222222222221, + "grad_norm": 4.252040386199951, + "learning_rate": 4.388888888888889e-06, + "loss": 0.0832, + "step": 1313 + }, + { + "epoch": 11.23076923076923, + "grad_norm": 2.1999545097351074, + "learning_rate": 4.384615384615385e-06, + "loss": 0.1121, + "step": 1314 + }, + { + "epoch": 11.239316239316238, + "grad_norm": 3.3256099224090576, + "learning_rate": 4.380341880341881e-06, + "loss": 0.1288, + "step": 1315 + }, + { + "epoch": 11.247863247863247, + "grad_norm": 2.6664986610412598, + "learning_rate": 4.376068376068377e-06, + "loss": 0.1044, + "step": 1316 + }, + { + "epoch": 11.256410256410255, + "grad_norm": 4.103114604949951, + "learning_rate": 4.371794871794872e-06, + "loss": 0.3115, + "step": 1317 + }, + { + "epoch": 11.264957264957266, + "grad_norm": 2.717532157897949, + "learning_rate": 4.367521367521368e-06, + "loss": 0.1144, + "step": 1318 + }, + { + "epoch": 11.273504273504274, + "grad_norm": 2.7918317317962646, + "learning_rate": 4.3632478632478635e-06, + "loss": 0.1205, + "step": 1319 + }, + { + "epoch": 11.282051282051283, + "grad_norm": 2.439854383468628, + "learning_rate": 4.358974358974359e-06, + "loss": 0.05, + "step": 1320 + }, + { + "epoch": 11.290598290598291, + "grad_norm": 1.3528865575790405, + "learning_rate": 4.354700854700855e-06, + "loss": 0.0437, + "step": 1321 + }, + { + "epoch": 11.2991452991453, + "grad_norm": 3.3273401260375977, + "learning_rate": 4.350427350427351e-06, + "loss": 0.1417, + "step": 1322 + }, + { + "epoch": 11.307692307692308, + "grad_norm": 4.022815704345703, + "learning_rate": 4.346153846153846e-06, + "loss": 0.0845, + "step": 1323 + }, + { + "epoch": 11.316239316239317, + "grad_norm": 5.169338703155518, + "learning_rate": 4.341880341880342e-06, + "loss": 0.5235, + "step": 1324 + }, + { + "epoch": 11.324786324786325, + "grad_norm": 1.8199687004089355, + "learning_rate": 4.337606837606838e-06, + "loss": 0.0399, + "step": 1325 + }, + { + "epoch": 11.333333333333334, + "grad_norm": 3.3616087436676025, + "learning_rate": 4.333333333333334e-06, + "loss": 0.1428, + "step": 1326 + }, + { + "epoch": 11.341880341880342, + "grad_norm": 14.056232452392578, + "learning_rate": 4.32905982905983e-06, + "loss": 0.2921, + "step": 1327 + }, + { + "epoch": 11.350427350427351, + "grad_norm": 2.3905317783355713, + "learning_rate": 4.324786324786326e-06, + "loss": 0.0478, + "step": 1328 + }, + { + "epoch": 11.35897435897436, + "grad_norm": 9.876815795898438, + "learning_rate": 4.320512820512821e-06, + "loss": 0.1926, + "step": 1329 + }, + { + "epoch": 11.367521367521368, + "grad_norm": 1.3726049661636353, + "learning_rate": 4.316239316239317e-06, + "loss": 0.0416, + "step": 1330 + }, + { + "epoch": 11.376068376068377, + "grad_norm": 3.0890841484069824, + "learning_rate": 4.3119658119658125e-06, + "loss": 0.0614, + "step": 1331 + }, + { + "epoch": 11.384615384615385, + "grad_norm": 2.858560562133789, + "learning_rate": 4.307692307692308e-06, + "loss": 0.2068, + "step": 1332 + }, + { + "epoch": 11.393162393162394, + "grad_norm": 4.6819658279418945, + "learning_rate": 4.303418803418804e-06, + "loss": 0.5773, + "step": 1333 + }, + { + "epoch": 11.401709401709402, + "grad_norm": 1.741450548171997, + "learning_rate": 4.299145299145299e-06, + "loss": 0.0505, + "step": 1334 + }, + { + "epoch": 11.41025641025641, + "grad_norm": 3.5882327556610107, + "learning_rate": 4.294871794871795e-06, + "loss": 0.1797, + "step": 1335 + }, + { + "epoch": 11.418803418803419, + "grad_norm": 3.59714937210083, + "learning_rate": 4.29059829059829e-06, + "loss": 0.1531, + "step": 1336 + }, + { + "epoch": 11.427350427350428, + "grad_norm": 3.619572877883911, + "learning_rate": 4.286324786324786e-06, + "loss": 0.1028, + "step": 1337 + }, + { + "epoch": 11.435897435897436, + "grad_norm": 3.9230782985687256, + "learning_rate": 4.282051282051282e-06, + "loss": 0.2404, + "step": 1338 + }, + { + "epoch": 11.444444444444445, + "grad_norm": 3.6987717151641846, + "learning_rate": 4.277777777777778e-06, + "loss": 0.1795, + "step": 1339 + }, + { + "epoch": 11.452991452991453, + "grad_norm": 3.322707176208496, + "learning_rate": 4.273504273504274e-06, + "loss": 0.0968, + "step": 1340 + }, + { + "epoch": 11.461538461538462, + "grad_norm": 1.2378501892089844, + "learning_rate": 4.26923076923077e-06, + "loss": 0.0387, + "step": 1341 + }, + { + "epoch": 11.47008547008547, + "grad_norm": 2.6801578998565674, + "learning_rate": 4.264957264957265e-06, + "loss": 0.0475, + "step": 1342 + }, + { + "epoch": 11.478632478632479, + "grad_norm": 2.2003352642059326, + "learning_rate": 4.260683760683761e-06, + "loss": 0.0505, + "step": 1343 + }, + { + "epoch": 11.487179487179487, + "grad_norm": 1.701341152191162, + "learning_rate": 4.2564102564102566e-06, + "loss": 0.064, + "step": 1344 + }, + { + "epoch": 11.495726495726496, + "grad_norm": 9.939803123474121, + "learning_rate": 4.2521367521367524e-06, + "loss": 0.461, + "step": 1345 + }, + { + "epoch": 11.504273504273504, + "grad_norm": 3.2999305725097656, + "learning_rate": 4.247863247863248e-06, + "loss": 0.1653, + "step": 1346 + }, + { + "epoch": 11.512820512820513, + "grad_norm": 3.9968252182006836, + "learning_rate": 4.243589743589744e-06, + "loss": 0.123, + "step": 1347 + }, + { + "epoch": 11.521367521367521, + "grad_norm": 2.846968173980713, + "learning_rate": 4.239316239316239e-06, + "loss": 0.1161, + "step": 1348 + }, + { + "epoch": 11.52991452991453, + "grad_norm": 4.328092575073242, + "learning_rate": 4.235042735042735e-06, + "loss": 0.065, + "step": 1349 + }, + { + "epoch": 11.538461538461538, + "grad_norm": 3.649003267288208, + "learning_rate": 4.230769230769231e-06, + "loss": 0.1919, + "step": 1350 + }, + { + "epoch": 11.547008547008547, + "grad_norm": 4.094634056091309, + "learning_rate": 4.226495726495727e-06, + "loss": 0.1728, + "step": 1351 + }, + { + "epoch": 11.555555555555555, + "grad_norm": 2.3904240131378174, + "learning_rate": 4.222222222222223e-06, + "loss": 0.105, + "step": 1352 + }, + { + "epoch": 11.564102564102564, + "grad_norm": 1.8493746519088745, + "learning_rate": 4.217948717948718e-06, + "loss": 0.0373, + "step": 1353 + }, + { + "epoch": 11.572649572649572, + "grad_norm": 4.690928936004639, + "learning_rate": 4.213675213675214e-06, + "loss": 0.3405, + "step": 1354 + }, + { + "epoch": 11.581196581196581, + "grad_norm": 6.808948516845703, + "learning_rate": 4.20940170940171e-06, + "loss": 0.1308, + "step": 1355 + }, + { + "epoch": 11.58974358974359, + "grad_norm": 6.060946464538574, + "learning_rate": 4.2051282051282055e-06, + "loss": 0.1494, + "step": 1356 + }, + { + "epoch": 11.598290598290598, + "grad_norm": 1.5923279523849487, + "learning_rate": 4.2008547008547014e-06, + "loss": 0.044, + "step": 1357 + }, + { + "epoch": 11.606837606837606, + "grad_norm": 1.7796354293823242, + "learning_rate": 4.196581196581197e-06, + "loss": 0.0558, + "step": 1358 + }, + { + "epoch": 11.615384615384615, + "grad_norm": 1.2209490537643433, + "learning_rate": 4.192307692307692e-06, + "loss": 0.0492, + "step": 1359 + }, + { + "epoch": 11.623931623931623, + "grad_norm": 4.0859880447387695, + "learning_rate": 4.188034188034188e-06, + "loss": 0.0759, + "step": 1360 + }, + { + "epoch": 11.632478632478632, + "grad_norm": 3.5021755695343018, + "learning_rate": 4.183760683760684e-06, + "loss": 0.1263, + "step": 1361 + }, + { + "epoch": 11.64102564102564, + "grad_norm": 2.5915517807006836, + "learning_rate": 4.17948717948718e-06, + "loss": 0.1949, + "step": 1362 + }, + { + "epoch": 11.649572649572649, + "grad_norm": 2.8024656772613525, + "learning_rate": 4.175213675213676e-06, + "loss": 0.2325, + "step": 1363 + }, + { + "epoch": 11.658119658119658, + "grad_norm": 5.795172691345215, + "learning_rate": 4.170940170940172e-06, + "loss": 0.3253, + "step": 1364 + }, + { + "epoch": 11.666666666666666, + "grad_norm": 5.056031227111816, + "learning_rate": 4.166666666666667e-06, + "loss": 0.102, + "step": 1365 + }, + { + "epoch": 11.675213675213675, + "grad_norm": 6.092950820922852, + "learning_rate": 4.162393162393163e-06, + "loss": 0.1938, + "step": 1366 + }, + { + "epoch": 11.683760683760683, + "grad_norm": 4.44755744934082, + "learning_rate": 4.158119658119659e-06, + "loss": 0.1588, + "step": 1367 + }, + { + "epoch": 11.692307692307692, + "grad_norm": 171.19509887695312, + "learning_rate": 4.1538461538461545e-06, + "loss": 0.3077, + "step": 1368 + }, + { + "epoch": 11.7008547008547, + "grad_norm": 13.992602348327637, + "learning_rate": 4.1495726495726504e-06, + "loss": 0.4401, + "step": 1369 + }, + { + "epoch": 11.709401709401709, + "grad_norm": 2.2174923419952393, + "learning_rate": 4.145299145299146e-06, + "loss": 0.1751, + "step": 1370 + }, + { + "epoch": 11.717948717948717, + "grad_norm": 2.031663179397583, + "learning_rate": 4.141025641025641e-06, + "loss": 0.049, + "step": 1371 + }, + { + "epoch": 11.726495726495726, + "grad_norm": 4.201449394226074, + "learning_rate": 4.136752136752136e-06, + "loss": 0.1016, + "step": 1372 + }, + { + "epoch": 11.735042735042736, + "grad_norm": 3.953226089477539, + "learning_rate": 4.132478632478632e-06, + "loss": 0.1336, + "step": 1373 + }, + { + "epoch": 11.743589743589745, + "grad_norm": 1.4856081008911133, + "learning_rate": 4.128205128205128e-06, + "loss": 0.0537, + "step": 1374 + }, + { + "epoch": 11.752136752136753, + "grad_norm": 1.2989288568496704, + "learning_rate": 4.123931623931624e-06, + "loss": 0.0351, + "step": 1375 + }, + { + "epoch": 11.760683760683762, + "grad_norm": 4.335974216461182, + "learning_rate": 4.11965811965812e-06, + "loss": 0.0722, + "step": 1376 + }, + { + "epoch": 11.76923076923077, + "grad_norm": 6.298306941986084, + "learning_rate": 4.115384615384616e-06, + "loss": 0.2359, + "step": 1377 + }, + { + "epoch": 11.777777777777779, + "grad_norm": 0.7119566798210144, + "learning_rate": 4.111111111111111e-06, + "loss": 0.0192, + "step": 1378 + }, + { + "epoch": 11.786324786324787, + "grad_norm": 2.7993624210357666, + "learning_rate": 4.106837606837607e-06, + "loss": 0.0605, + "step": 1379 + }, + { + "epoch": 11.794871794871796, + "grad_norm": 6.566782474517822, + "learning_rate": 4.102564102564103e-06, + "loss": 0.3883, + "step": 1380 + }, + { + "epoch": 11.803418803418804, + "grad_norm": 8.177830696105957, + "learning_rate": 4.0982905982905986e-06, + "loss": 0.257, + "step": 1381 + }, + { + "epoch": 11.811965811965813, + "grad_norm": 4.04230260848999, + "learning_rate": 4.0940170940170945e-06, + "loss": 0.0943, + "step": 1382 + }, + { + "epoch": 11.820512820512821, + "grad_norm": 3.595386505126953, + "learning_rate": 4.08974358974359e-06, + "loss": 0.0533, + "step": 1383 + }, + { + "epoch": 11.82905982905983, + "grad_norm": 3.755312204360962, + "learning_rate": 4.085470085470085e-06, + "loss": 0.0468, + "step": 1384 + }, + { + "epoch": 11.837606837606838, + "grad_norm": 2.0697362422943115, + "learning_rate": 4.081196581196581e-06, + "loss": 0.063, + "step": 1385 + }, + { + "epoch": 11.846153846153847, + "grad_norm": 7.690021991729736, + "learning_rate": 4.076923076923077e-06, + "loss": 0.2415, + "step": 1386 + }, + { + "epoch": 11.854700854700855, + "grad_norm": 3.0239031314849854, + "learning_rate": 4.072649572649573e-06, + "loss": 0.1257, + "step": 1387 + }, + { + "epoch": 11.863247863247864, + "grad_norm": 2.263847589492798, + "learning_rate": 4.068376068376069e-06, + "loss": 0.132, + "step": 1388 + }, + { + "epoch": 11.871794871794872, + "grad_norm": 2.9513261318206787, + "learning_rate": 4.064102564102565e-06, + "loss": 0.1229, + "step": 1389 + }, + { + "epoch": 11.88034188034188, + "grad_norm": 3.03973388671875, + "learning_rate": 4.05982905982906e-06, + "loss": 0.0966, + "step": 1390 + }, + { + "epoch": 11.88888888888889, + "grad_norm": 1.0075026750564575, + "learning_rate": 4.055555555555556e-06, + "loss": 0.0284, + "step": 1391 + }, + { + "epoch": 11.897435897435898, + "grad_norm": 1.5330802202224731, + "learning_rate": 4.051282051282052e-06, + "loss": 0.0614, + "step": 1392 + }, + { + "epoch": 11.905982905982906, + "grad_norm": 3.6498589515686035, + "learning_rate": 4.0470085470085476e-06, + "loss": 0.2236, + "step": 1393 + }, + { + "epoch": 11.914529914529915, + "grad_norm": 4.659658908843994, + "learning_rate": 4.0427350427350435e-06, + "loss": 0.3245, + "step": 1394 + }, + { + "epoch": 11.923076923076923, + "grad_norm": 3.921703815460205, + "learning_rate": 4.0384615384615385e-06, + "loss": 0.2981, + "step": 1395 + }, + { + "epoch": 11.931623931623932, + "grad_norm": 5.816749572753906, + "learning_rate": 4.034188034188034e-06, + "loss": 0.1606, + "step": 1396 + }, + { + "epoch": 11.94017094017094, + "grad_norm": 1.2831742763519287, + "learning_rate": 4.02991452991453e-06, + "loss": 0.0307, + "step": 1397 + }, + { + "epoch": 11.948717948717949, + "grad_norm": 5.745227813720703, + "learning_rate": 4.025641025641026e-06, + "loss": 0.5323, + "step": 1398 + }, + { + "epoch": 11.957264957264957, + "grad_norm": 2.4196462631225586, + "learning_rate": 4.021367521367522e-06, + "loss": 0.09, + "step": 1399 + }, + { + "epoch": 11.965811965811966, + "grad_norm": 8.084505081176758, + "learning_rate": 4.017094017094018e-06, + "loss": 0.2991, + "step": 1400 + }, + { + "epoch": 11.974358974358974, + "grad_norm": 3.786708116531372, + "learning_rate": 4.012820512820513e-06, + "loss": 0.2163, + "step": 1401 + }, + { + "epoch": 11.982905982905983, + "grad_norm": 4.76535701751709, + "learning_rate": 4.008547008547009e-06, + "loss": 0.2453, + "step": 1402 + }, + { + "epoch": 11.991452991452991, + "grad_norm": 7.380269527435303, + "learning_rate": 4.004273504273505e-06, + "loss": 0.3525, + "step": 1403 + }, + { + "epoch": 12.0, + "grad_norm": 41.21335983276367, + "learning_rate": 4.000000000000001e-06, + "loss": 0.2139, + "step": 1404 + }, + { + "epoch": 12.0, + "eval_loss": 0.07730000466108322, + "eval_runtime": 9.2426, + "eval_samples_per_second": 50.419, + "eval_steps_per_second": 6.383, + "step": 1404 + }, + { + "epoch": 12.008547008547009, + "grad_norm": 2.3692574501037598, + "learning_rate": 3.9957264957264966e-06, + "loss": 0.0939, + "step": 1405 + }, + { + "epoch": 12.017094017094017, + "grad_norm": 8.087658882141113, + "learning_rate": 3.9914529914529924e-06, + "loss": 0.2801, + "step": 1406 + }, + { + "epoch": 12.025641025641026, + "grad_norm": 8.448614120483398, + "learning_rate": 3.9871794871794875e-06, + "loss": 0.2069, + "step": 1407 + }, + { + "epoch": 12.034188034188034, + "grad_norm": 1.8581651449203491, + "learning_rate": 3.982905982905983e-06, + "loss": 0.0509, + "step": 1408 + }, + { + "epoch": 12.042735042735043, + "grad_norm": 1.711654543876648, + "learning_rate": 3.9786324786324784e-06, + "loss": 0.0464, + "step": 1409 + }, + { + "epoch": 12.051282051282051, + "grad_norm": 1.482553482055664, + "learning_rate": 3.974358974358974e-06, + "loss": 0.028, + "step": 1410 + }, + { + "epoch": 12.05982905982906, + "grad_norm": 8.005542755126953, + "learning_rate": 3.97008547008547e-06, + "loss": 0.2587, + "step": 1411 + }, + { + "epoch": 12.068376068376068, + "grad_norm": 2.1153948307037354, + "learning_rate": 3.965811965811966e-06, + "loss": 0.0563, + "step": 1412 + }, + { + "epoch": 12.076923076923077, + "grad_norm": 7.791186809539795, + "learning_rate": 3.961538461538462e-06, + "loss": 0.0587, + "step": 1413 + }, + { + "epoch": 12.085470085470085, + "grad_norm": 21.04537582397461, + "learning_rate": 3.957264957264957e-06, + "loss": 0.252, + "step": 1414 + }, + { + "epoch": 12.094017094017094, + "grad_norm": 3.144742727279663, + "learning_rate": 3.952991452991453e-06, + "loss": 0.2207, + "step": 1415 + }, + { + "epoch": 12.102564102564102, + "grad_norm": 2.23223614692688, + "learning_rate": 3.948717948717949e-06, + "loss": 0.0923, + "step": 1416 + }, + { + "epoch": 12.11111111111111, + "grad_norm": 3.5652217864990234, + "learning_rate": 3.944444444444445e-06, + "loss": 0.2197, + "step": 1417 + }, + { + "epoch": 12.11965811965812, + "grad_norm": 3.1105499267578125, + "learning_rate": 3.940170940170941e-06, + "loss": 0.071, + "step": 1418 + }, + { + "epoch": 12.128205128205128, + "grad_norm": 2.525405168533325, + "learning_rate": 3.9358974358974365e-06, + "loss": 0.0874, + "step": 1419 + }, + { + "epoch": 12.136752136752136, + "grad_norm": 4.479174613952637, + "learning_rate": 3.9316239316239315e-06, + "loss": 0.1872, + "step": 1420 + }, + { + "epoch": 12.145299145299145, + "grad_norm": 2.0484113693237305, + "learning_rate": 3.927350427350427e-06, + "loss": 0.0739, + "step": 1421 + }, + { + "epoch": 12.153846153846153, + "grad_norm": 2.014679431915283, + "learning_rate": 3.923076923076923e-06, + "loss": 0.1089, + "step": 1422 + }, + { + "epoch": 12.162393162393162, + "grad_norm": 4.71014404296875, + "learning_rate": 3.918803418803419e-06, + "loss": 0.3136, + "step": 1423 + }, + { + "epoch": 12.17094017094017, + "grad_norm": 2.1372437477111816, + "learning_rate": 3.914529914529915e-06, + "loss": 0.0458, + "step": 1424 + }, + { + "epoch": 12.179487179487179, + "grad_norm": 1.4595564603805542, + "learning_rate": 3.910256410256411e-06, + "loss": 0.0601, + "step": 1425 + }, + { + "epoch": 12.188034188034187, + "grad_norm": 4.45602560043335, + "learning_rate": 3.905982905982906e-06, + "loss": 0.091, + "step": 1426 + }, + { + "epoch": 12.196581196581196, + "grad_norm": 1.473585844039917, + "learning_rate": 3.901709401709402e-06, + "loss": 0.0515, + "step": 1427 + }, + { + "epoch": 12.205128205128204, + "grad_norm": 1.8761534690856934, + "learning_rate": 3.897435897435898e-06, + "loss": 0.055, + "step": 1428 + }, + { + "epoch": 12.213675213675213, + "grad_norm": 0.7121579647064209, + "learning_rate": 3.893162393162394e-06, + "loss": 0.0197, + "step": 1429 + }, + { + "epoch": 12.222222222222221, + "grad_norm": 2.0035219192504883, + "learning_rate": 3.88888888888889e-06, + "loss": 0.0904, + "step": 1430 + }, + { + "epoch": 12.23076923076923, + "grad_norm": 3.820181369781494, + "learning_rate": 3.884615384615385e-06, + "loss": 0.2415, + "step": 1431 + }, + { + "epoch": 12.239316239316238, + "grad_norm": 3.40633225440979, + "learning_rate": 3.8803418803418805e-06, + "loss": 0.0593, + "step": 1432 + }, + { + "epoch": 12.247863247863247, + "grad_norm": 7.093897342681885, + "learning_rate": 3.876068376068376e-06, + "loss": 0.2504, + "step": 1433 + }, + { + "epoch": 12.256410256410255, + "grad_norm": 2.1057517528533936, + "learning_rate": 3.871794871794872e-06, + "loss": 0.0573, + "step": 1434 + }, + { + "epoch": 12.264957264957266, + "grad_norm": 4.797401428222656, + "learning_rate": 3.867521367521368e-06, + "loss": 0.338, + "step": 1435 + }, + { + "epoch": 12.273504273504274, + "grad_norm": 20.711339950561523, + "learning_rate": 3.863247863247864e-06, + "loss": 0.1964, + "step": 1436 + }, + { + "epoch": 12.282051282051283, + "grad_norm": 2.725280523300171, + "learning_rate": 3.858974358974359e-06, + "loss": 0.1837, + "step": 1437 + }, + { + "epoch": 12.290598290598291, + "grad_norm": 0.9469479322433472, + "learning_rate": 3.854700854700855e-06, + "loss": 0.0231, + "step": 1438 + }, + { + "epoch": 12.2991452991453, + "grad_norm": 2.0424935817718506, + "learning_rate": 3.850427350427351e-06, + "loss": 0.1373, + "step": 1439 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 1.4781558513641357, + "learning_rate": 3.846153846153847e-06, + "loss": 0.0393, + "step": 1440 + }, + { + "epoch": 12.316239316239317, + "grad_norm": 3.7576427459716797, + "learning_rate": 3.841880341880343e-06, + "loss": 0.1134, + "step": 1441 + }, + { + "epoch": 12.324786324786325, + "grad_norm": 299.5986633300781, + "learning_rate": 3.8376068376068386e-06, + "loss": 0.8017, + "step": 1442 + }, + { + "epoch": 12.333333333333334, + "grad_norm": 3.109199047088623, + "learning_rate": 3.833333333333334e-06, + "loss": 0.1014, + "step": 1443 + }, + { + "epoch": 12.341880341880342, + "grad_norm": 6.353960990905762, + "learning_rate": 3.8290598290598295e-06, + "loss": 0.3484, + "step": 1444 + }, + { + "epoch": 12.350427350427351, + "grad_norm": 12.957517623901367, + "learning_rate": 3.8247863247863246e-06, + "loss": 0.5644, + "step": 1445 + }, + { + "epoch": 12.35897435897436, + "grad_norm": 10.197676658630371, + "learning_rate": 3.8205128205128204e-06, + "loss": 0.1525, + "step": 1446 + }, + { + "epoch": 12.367521367521368, + "grad_norm": 1.7754546403884888, + "learning_rate": 3.816239316239316e-06, + "loss": 0.0259, + "step": 1447 + }, + { + "epoch": 12.376068376068377, + "grad_norm": 1.4237226247787476, + "learning_rate": 3.8119658119658122e-06, + "loss": 0.0307, + "step": 1448 + }, + { + "epoch": 12.384615384615385, + "grad_norm": 2.94474458694458, + "learning_rate": 3.8076923076923077e-06, + "loss": 0.1447, + "step": 1449 + }, + { + "epoch": 12.393162393162394, + "grad_norm": 3.7823615074157715, + "learning_rate": 3.8034188034188036e-06, + "loss": 0.071, + "step": 1450 + }, + { + "epoch": 12.401709401709402, + "grad_norm": 7.5281081199646, + "learning_rate": 3.7991452991452995e-06, + "loss": 0.1805, + "step": 1451 + }, + { + "epoch": 12.41025641025641, + "grad_norm": 2.523592233657837, + "learning_rate": 3.794871794871795e-06, + "loss": 0.0684, + "step": 1452 + }, + { + "epoch": 12.418803418803419, + "grad_norm": 2.423443078994751, + "learning_rate": 3.790598290598291e-06, + "loss": 0.0726, + "step": 1453 + }, + { + "epoch": 12.427350427350428, + "grad_norm": 6.3336005210876465, + "learning_rate": 3.7863247863247863e-06, + "loss": 0.1684, + "step": 1454 + }, + { + "epoch": 12.435897435897436, + "grad_norm": 248.31146240234375, + "learning_rate": 3.782051282051282e-06, + "loss": 0.6863, + "step": 1455 + }, + { + "epoch": 12.444444444444445, + "grad_norm": 3.0117695331573486, + "learning_rate": 3.777777777777778e-06, + "loss": 0.217, + "step": 1456 + }, + { + "epoch": 12.452991452991453, + "grad_norm": 1.4753539562225342, + "learning_rate": 3.7735042735042735e-06, + "loss": 0.0623, + "step": 1457 + }, + { + "epoch": 12.461538461538462, + "grad_norm": 2.095745325088501, + "learning_rate": 3.7692307692307694e-06, + "loss": 0.055, + "step": 1458 + }, + { + "epoch": 12.47008547008547, + "grad_norm": 3.508305788040161, + "learning_rate": 3.7649572649572653e-06, + "loss": 0.1097, + "step": 1459 + }, + { + "epoch": 12.478632478632479, + "grad_norm": 3.0965282917022705, + "learning_rate": 3.760683760683761e-06, + "loss": 0.3374, + "step": 1460 + }, + { + "epoch": 12.487179487179487, + "grad_norm": 0.7286785244941711, + "learning_rate": 3.7564102564102567e-06, + "loss": 0.0182, + "step": 1461 + }, + { + "epoch": 12.495726495726496, + "grad_norm": 5.957888126373291, + "learning_rate": 3.7521367521367526e-06, + "loss": 0.3498, + "step": 1462 + }, + { + "epoch": 12.504273504273504, + "grad_norm": 10.433263778686523, + "learning_rate": 3.747863247863248e-06, + "loss": 0.446, + "step": 1463 + }, + { + "epoch": 12.512820512820513, + "grad_norm": 4.565568923950195, + "learning_rate": 3.743589743589744e-06, + "loss": 0.1026, + "step": 1464 + }, + { + "epoch": 12.521367521367521, + "grad_norm": 2.607106924057007, + "learning_rate": 3.73931623931624e-06, + "loss": 0.0912, + "step": 1465 + }, + { + "epoch": 12.52991452991453, + "grad_norm": 2.415541410446167, + "learning_rate": 3.7350427350427353e-06, + "loss": 0.0594, + "step": 1466 + }, + { + "epoch": 12.538461538461538, + "grad_norm": 7.978870868682861, + "learning_rate": 3.730769230769231e-06, + "loss": 0.2617, + "step": 1467 + }, + { + "epoch": 12.547008547008547, + "grad_norm": 6.858293056488037, + "learning_rate": 3.726495726495727e-06, + "loss": 0.3642, + "step": 1468 + }, + { + "epoch": 12.555555555555555, + "grad_norm": 1.3900551795959473, + "learning_rate": 3.7222222222222225e-06, + "loss": 0.0445, + "step": 1469 + }, + { + "epoch": 12.564102564102564, + "grad_norm": 8.111970901489258, + "learning_rate": 3.7179487179487184e-06, + "loss": 0.1828, + "step": 1470 + }, + { + "epoch": 12.572649572649572, + "grad_norm": 2.731841802597046, + "learning_rate": 3.7136752136752143e-06, + "loss": 0.2027, + "step": 1471 + }, + { + "epoch": 12.581196581196581, + "grad_norm": 4.418527126312256, + "learning_rate": 3.7094017094017098e-06, + "loss": 0.1744, + "step": 1472 + }, + { + "epoch": 12.58974358974359, + "grad_norm": 2.8263015747070312, + "learning_rate": 3.7051282051282057e-06, + "loss": 0.1123, + "step": 1473 + }, + { + "epoch": 12.598290598290598, + "grad_norm": 2.3524725437164307, + "learning_rate": 3.700854700854701e-06, + "loss": 0.0999, + "step": 1474 + }, + { + "epoch": 12.606837606837606, + "grad_norm": 9.863709449768066, + "learning_rate": 3.696581196581197e-06, + "loss": 0.4589, + "step": 1475 + }, + { + "epoch": 12.615384615384615, + "grad_norm": 3.5506396293640137, + "learning_rate": 3.692307692307693e-06, + "loss": 0.2034, + "step": 1476 + }, + { + "epoch": 12.623931623931623, + "grad_norm": 2.4352779388427734, + "learning_rate": 3.6880341880341884e-06, + "loss": 0.0806, + "step": 1477 + }, + { + "epoch": 12.632478632478632, + "grad_norm": 1.8339797258377075, + "learning_rate": 3.6837606837606843e-06, + "loss": 0.0635, + "step": 1478 + }, + { + "epoch": 12.64102564102564, + "grad_norm": 4.63474178314209, + "learning_rate": 3.67948717948718e-06, + "loss": 0.4568, + "step": 1479 + }, + { + "epoch": 12.649572649572649, + "grad_norm": 7.696872711181641, + "learning_rate": 3.6752136752136756e-06, + "loss": 0.1769, + "step": 1480 + }, + { + "epoch": 12.658119658119658, + "grad_norm": 1.3894271850585938, + "learning_rate": 3.670940170940171e-06, + "loss": 0.0747, + "step": 1481 + }, + { + "epoch": 12.666666666666666, + "grad_norm": 5.607828140258789, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.1178, + "step": 1482 + }, + { + "epoch": 12.675213675213675, + "grad_norm": 2.120594024658203, + "learning_rate": 3.6623931623931625e-06, + "loss": 0.0497, + "step": 1483 + }, + { + "epoch": 12.683760683760683, + "grad_norm": 1.359381914138794, + "learning_rate": 3.6581196581196584e-06, + "loss": 0.035, + "step": 1484 + }, + { + "epoch": 12.692307692307692, + "grad_norm": 2.8533923625946045, + "learning_rate": 3.653846153846154e-06, + "loss": 0.1048, + "step": 1485 + }, + { + "epoch": 12.7008547008547, + "grad_norm": 6.021198749542236, + "learning_rate": 3.6495726495726497e-06, + "loss": 0.1604, + "step": 1486 + }, + { + "epoch": 12.709401709401709, + "grad_norm": 7.198216915130615, + "learning_rate": 3.6452991452991456e-06, + "loss": 0.1656, + "step": 1487 + }, + { + "epoch": 12.717948717948717, + "grad_norm": 1.4581981897354126, + "learning_rate": 3.641025641025641e-06, + "loss": 0.0398, + "step": 1488 + }, + { + "epoch": 12.726495726495726, + "grad_norm": 30.704627990722656, + "learning_rate": 3.636752136752137e-06, + "loss": 0.3371, + "step": 1489 + }, + { + "epoch": 12.735042735042736, + "grad_norm": 2.5204057693481445, + "learning_rate": 3.632478632478633e-06, + "loss": 0.0742, + "step": 1490 + }, + { + "epoch": 12.743589743589745, + "grad_norm": 2.3917508125305176, + "learning_rate": 3.6282051282051283e-06, + "loss": 0.1681, + "step": 1491 + }, + { + "epoch": 12.752136752136753, + "grad_norm": 1.4529337882995605, + "learning_rate": 3.623931623931624e-06, + "loss": 0.0247, + "step": 1492 + }, + { + "epoch": 12.760683760683762, + "grad_norm": 31.894805908203125, + "learning_rate": 3.6196581196581197e-06, + "loss": 0.2222, + "step": 1493 + }, + { + "epoch": 12.76923076923077, + "grad_norm": 3.4240164756774902, + "learning_rate": 3.6153846153846156e-06, + "loss": 0.1432, + "step": 1494 + }, + { + "epoch": 12.777777777777779, + "grad_norm": 2.0000102519989014, + "learning_rate": 3.6111111111111115e-06, + "loss": 0.0383, + "step": 1495 + }, + { + "epoch": 12.786324786324787, + "grad_norm": 3.7665908336639404, + "learning_rate": 3.606837606837607e-06, + "loss": 0.2719, + "step": 1496 + }, + { + "epoch": 12.794871794871796, + "grad_norm": 2.0319290161132812, + "learning_rate": 3.602564102564103e-06, + "loss": 0.0741, + "step": 1497 + }, + { + "epoch": 12.803418803418804, + "grad_norm": 2.3379619121551514, + "learning_rate": 3.5982905982905987e-06, + "loss": 0.1155, + "step": 1498 + }, + { + "epoch": 12.811965811965813, + "grad_norm": 5.183985233306885, + "learning_rate": 3.594017094017094e-06, + "loss": 0.0815, + "step": 1499 + }, + { + "epoch": 12.820512820512821, + "grad_norm": 3.1432502269744873, + "learning_rate": 3.58974358974359e-06, + "loss": 0.1855, + "step": 1500 + }, + { + "epoch": 12.82905982905983, + "grad_norm": 4.5739946365356445, + "learning_rate": 3.585470085470086e-06, + "loss": 0.1424, + "step": 1501 + }, + { + "epoch": 12.837606837606838, + "grad_norm": 1.6006520986557007, + "learning_rate": 3.5811965811965814e-06, + "loss": 0.0305, + "step": 1502 + }, + { + "epoch": 12.846153846153847, + "grad_norm": 3.937011241912842, + "learning_rate": 3.5769230769230773e-06, + "loss": 0.2497, + "step": 1503 + }, + { + "epoch": 12.854700854700855, + "grad_norm": 2.6159651279449463, + "learning_rate": 3.572649572649573e-06, + "loss": 0.1067, + "step": 1504 + }, + { + "epoch": 12.863247863247864, + "grad_norm": 2.578547239303589, + "learning_rate": 3.5683760683760687e-06, + "loss": 0.0663, + "step": 1505 + }, + { + "epoch": 12.871794871794872, + "grad_norm": 2.3777639865875244, + "learning_rate": 3.5641025641025646e-06, + "loss": 0.0558, + "step": 1506 + }, + { + "epoch": 12.88034188034188, + "grad_norm": 7.5656561851501465, + "learning_rate": 3.5598290598290604e-06, + "loss": 0.2448, + "step": 1507 + }, + { + "epoch": 12.88888888888889, + "grad_norm": 4.21798849105835, + "learning_rate": 3.555555555555556e-06, + "loss": 0.1916, + "step": 1508 + }, + { + "epoch": 12.897435897435898, + "grad_norm": 1.318049669265747, + "learning_rate": 3.551282051282052e-06, + "loss": 0.0387, + "step": 1509 + }, + { + "epoch": 12.905982905982906, + "grad_norm": 2.4345362186431885, + "learning_rate": 3.5470085470085473e-06, + "loss": 0.061, + "step": 1510 + }, + { + "epoch": 12.914529914529915, + "grad_norm": 3.2767112255096436, + "learning_rate": 3.542735042735043e-06, + "loss": 0.1627, + "step": 1511 + }, + { + "epoch": 12.923076923076923, + "grad_norm": 6.881056785583496, + "learning_rate": 3.538461538461539e-06, + "loss": 0.2452, + "step": 1512 + }, + { + "epoch": 12.931623931623932, + "grad_norm": 8.017362594604492, + "learning_rate": 3.5341880341880345e-06, + "loss": 0.1972, + "step": 1513 + }, + { + "epoch": 12.94017094017094, + "grad_norm": 1.1411398649215698, + "learning_rate": 3.5299145299145304e-06, + "loss": 0.0243, + "step": 1514 + }, + { + "epoch": 12.948717948717949, + "grad_norm": 4.486563205718994, + "learning_rate": 3.5256410256410263e-06, + "loss": 0.1347, + "step": 1515 + }, + { + "epoch": 12.957264957264957, + "grad_norm": 2.348222494125366, + "learning_rate": 3.5213675213675218e-06, + "loss": 0.1828, + "step": 1516 + }, + { + "epoch": 12.965811965811966, + "grad_norm": 2.2855775356292725, + "learning_rate": 3.5170940170940177e-06, + "loss": 0.0465, + "step": 1517 + }, + { + "epoch": 12.974358974358974, + "grad_norm": 10.313456535339355, + "learning_rate": 3.5128205128205127e-06, + "loss": 0.3033, + "step": 1518 + }, + { + "epoch": 12.982905982905983, + "grad_norm": 12.115890502929688, + "learning_rate": 3.5085470085470086e-06, + "loss": 0.6762, + "step": 1519 + }, + { + "epoch": 12.991452991452991, + "grad_norm": 2.746267557144165, + "learning_rate": 3.5042735042735045e-06, + "loss": 0.123, + "step": 1520 + }, + { + "epoch": 13.0, + "grad_norm": 5.204991340637207, + "learning_rate": 3.5e-06, + "loss": 0.2086, + "step": 1521 + }, + { + "epoch": 13.0, + "eval_loss": 0.06878729909658432, + "eval_runtime": 9.2334, + "eval_samples_per_second": 50.469, + "eval_steps_per_second": 6.39, + "step": 1521 + }, + { + "epoch": 13.008547008547009, + "grad_norm": 1.8741862773895264, + "learning_rate": 3.495726495726496e-06, + "loss": 0.0594, + "step": 1522 + }, + { + "epoch": 13.017094017094017, + "grad_norm": 1.6060154438018799, + "learning_rate": 3.4914529914529917e-06, + "loss": 0.0426, + "step": 1523 + }, + { + "epoch": 13.025641025641026, + "grad_norm": 2.194714069366455, + "learning_rate": 3.487179487179487e-06, + "loss": 0.1907, + "step": 1524 + }, + { + "epoch": 13.034188034188034, + "grad_norm": 0.716149628162384, + "learning_rate": 3.482905982905983e-06, + "loss": 0.0177, + "step": 1525 + }, + { + "epoch": 13.042735042735043, + "grad_norm": 4.787989139556885, + "learning_rate": 3.478632478632479e-06, + "loss": 0.246, + "step": 1526 + }, + { + "epoch": 13.051282051282051, + "grad_norm": 1.662338137626648, + "learning_rate": 3.4743589743589744e-06, + "loss": 0.0561, + "step": 1527 + }, + { + "epoch": 13.05982905982906, + "grad_norm": 0.9663236737251282, + "learning_rate": 3.4700854700854703e-06, + "loss": 0.0392, + "step": 1528 + }, + { + "epoch": 13.068376068376068, + "grad_norm": 0.8232766389846802, + "learning_rate": 3.465811965811966e-06, + "loss": 0.0221, + "step": 1529 + }, + { + "epoch": 13.076923076923077, + "grad_norm": 2.434157609939575, + "learning_rate": 3.4615384615384617e-06, + "loss": 0.1777, + "step": 1530 + }, + { + "epoch": 13.085470085470085, + "grad_norm": 2.768070936203003, + "learning_rate": 3.4572649572649576e-06, + "loss": 0.1101, + "step": 1531 + }, + { + "epoch": 13.094017094017094, + "grad_norm": 2.061371088027954, + "learning_rate": 3.452991452991453e-06, + "loss": 0.0591, + "step": 1532 + }, + { + "epoch": 13.102564102564102, + "grad_norm": 1.6127598285675049, + "learning_rate": 3.448717948717949e-06, + "loss": 0.3858, + "step": 1533 + }, + { + "epoch": 13.11111111111111, + "grad_norm": 1.2561885118484497, + "learning_rate": 3.444444444444445e-06, + "loss": 0.0315, + "step": 1534 + }, + { + "epoch": 13.11965811965812, + "grad_norm": 2.2859408855438232, + "learning_rate": 3.4401709401709403e-06, + "loss": 0.047, + "step": 1535 + }, + { + "epoch": 13.128205128205128, + "grad_norm": 3.7528388500213623, + "learning_rate": 3.435897435897436e-06, + "loss": 0.1069, + "step": 1536 + }, + { + "epoch": 13.136752136752136, + "grad_norm": 5.547614574432373, + "learning_rate": 3.431623931623932e-06, + "loss": 0.1411, + "step": 1537 + }, + { + "epoch": 13.145299145299145, + "grad_norm": 1.6566565036773682, + "learning_rate": 3.4273504273504275e-06, + "loss": 0.0266, + "step": 1538 + }, + { + "epoch": 13.153846153846153, + "grad_norm": 5.280163288116455, + "learning_rate": 3.4230769230769234e-06, + "loss": 0.0843, + "step": 1539 + }, + { + "epoch": 13.162393162393162, + "grad_norm": 6.624744892120361, + "learning_rate": 3.4188034188034193e-06, + "loss": 0.1652, + "step": 1540 + }, + { + "epoch": 13.17094017094017, + "grad_norm": 5.325616359710693, + "learning_rate": 3.414529914529915e-06, + "loss": 0.077, + "step": 1541 + }, + { + "epoch": 13.179487179487179, + "grad_norm": 11.31779956817627, + "learning_rate": 3.4102564102564107e-06, + "loss": 0.4377, + "step": 1542 + }, + { + "epoch": 13.188034188034187, + "grad_norm": 4.86885404586792, + "learning_rate": 3.4059829059829066e-06, + "loss": 0.2312, + "step": 1543 + }, + { + "epoch": 13.196581196581196, + "grad_norm": 1.779068112373352, + "learning_rate": 3.401709401709402e-06, + "loss": 0.032, + "step": 1544 + }, + { + "epoch": 13.205128205128204, + "grad_norm": 1.9934108257293701, + "learning_rate": 3.397435897435898e-06, + "loss": 0.0861, + "step": 1545 + }, + { + "epoch": 13.213675213675213, + "grad_norm": 2.1829612255096436, + "learning_rate": 3.3931623931623934e-06, + "loss": 0.0855, + "step": 1546 + }, + { + "epoch": 13.222222222222221, + "grad_norm": 31.108810424804688, + "learning_rate": 3.3888888888888893e-06, + "loss": 0.334, + "step": 1547 + }, + { + "epoch": 13.23076923076923, + "grad_norm": 4.867705345153809, + "learning_rate": 3.384615384615385e-06, + "loss": 0.0808, + "step": 1548 + }, + { + "epoch": 13.239316239316238, + "grad_norm": 3.226783275604248, + "learning_rate": 3.3803418803418806e-06, + "loss": 0.1806, + "step": 1549 + }, + { + "epoch": 13.247863247863247, + "grad_norm": 1.4822824001312256, + "learning_rate": 3.3760683760683765e-06, + "loss": 0.0602, + "step": 1550 + }, + { + "epoch": 13.256410256410255, + "grad_norm": 4.529379844665527, + "learning_rate": 3.3717948717948724e-06, + "loss": 0.318, + "step": 1551 + }, + { + "epoch": 13.264957264957266, + "grad_norm": 3.2155706882476807, + "learning_rate": 3.367521367521368e-06, + "loss": 0.1006, + "step": 1552 + }, + { + "epoch": 13.273504273504274, + "grad_norm": 2.2805707454681396, + "learning_rate": 3.3632478632478638e-06, + "loss": 0.0774, + "step": 1553 + }, + { + "epoch": 13.282051282051283, + "grad_norm": 11.477370262145996, + "learning_rate": 3.358974358974359e-06, + "loss": 0.8342, + "step": 1554 + }, + { + "epoch": 13.290598290598291, + "grad_norm": 3.8596534729003906, + "learning_rate": 3.3547008547008547e-06, + "loss": 0.1924, + "step": 1555 + }, + { + "epoch": 13.2991452991453, + "grad_norm": 4.497336387634277, + "learning_rate": 3.3504273504273506e-06, + "loss": 0.2425, + "step": 1556 + }, + { + "epoch": 13.307692307692308, + "grad_norm": 1.4496978521347046, + "learning_rate": 3.346153846153846e-06, + "loss": 0.0168, + "step": 1557 + }, + { + "epoch": 13.316239316239317, + "grad_norm": 2.0277416706085205, + "learning_rate": 3.341880341880342e-06, + "loss": 0.0634, + "step": 1558 + }, + { + "epoch": 13.324786324786325, + "grad_norm": 2.9120066165924072, + "learning_rate": 3.337606837606838e-06, + "loss": 0.1153, + "step": 1559 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 4.949625015258789, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.1412, + "step": 1560 + }, + { + "epoch": 13.341880341880342, + "grad_norm": 5.970853805541992, + "learning_rate": 3.3290598290598292e-06, + "loss": 0.1607, + "step": 1561 + }, + { + "epoch": 13.350427350427351, + "grad_norm": 2.1988022327423096, + "learning_rate": 3.324786324786325e-06, + "loss": 0.0329, + "step": 1562 + }, + { + "epoch": 13.35897435897436, + "grad_norm": 2.3578758239746094, + "learning_rate": 3.3205128205128206e-06, + "loss": 0.0711, + "step": 1563 + }, + { + "epoch": 13.367521367521368, + "grad_norm": 4.554023742675781, + "learning_rate": 3.3162393162393165e-06, + "loss": 0.1929, + "step": 1564 + }, + { + "epoch": 13.376068376068377, + "grad_norm": 3.577073335647583, + "learning_rate": 3.311965811965812e-06, + "loss": 0.0969, + "step": 1565 + }, + { + "epoch": 13.384615384615385, + "grad_norm": 3.3863015174865723, + "learning_rate": 3.307692307692308e-06, + "loss": 0.2402, + "step": 1566 + }, + { + "epoch": 13.393162393162394, + "grad_norm": 1.044550895690918, + "learning_rate": 3.3034188034188037e-06, + "loss": 0.026, + "step": 1567 + }, + { + "epoch": 13.401709401709402, + "grad_norm": 3.1525843143463135, + "learning_rate": 3.299145299145299e-06, + "loss": 0.0619, + "step": 1568 + }, + { + "epoch": 13.41025641025641, + "grad_norm": 2.0380606651306152, + "learning_rate": 3.294871794871795e-06, + "loss": 0.0477, + "step": 1569 + }, + { + "epoch": 13.418803418803419, + "grad_norm": 2.4260973930358887, + "learning_rate": 3.290598290598291e-06, + "loss": 0.0709, + "step": 1570 + }, + { + "epoch": 13.427350427350428, + "grad_norm": 20.958803176879883, + "learning_rate": 3.2863247863247864e-06, + "loss": 0.2297, + "step": 1571 + }, + { + "epoch": 13.435897435897436, + "grad_norm": 2.847252368927002, + "learning_rate": 3.2820512820512823e-06, + "loss": 0.0565, + "step": 1572 + }, + { + "epoch": 13.444444444444445, + "grad_norm": 3.646381139755249, + "learning_rate": 3.277777777777778e-06, + "loss": 0.3043, + "step": 1573 + }, + { + "epoch": 13.452991452991453, + "grad_norm": 3.0526609420776367, + "learning_rate": 3.2735042735042737e-06, + "loss": 0.0941, + "step": 1574 + }, + { + "epoch": 13.461538461538462, + "grad_norm": 1.6154388189315796, + "learning_rate": 3.2692307692307696e-06, + "loss": 0.0597, + "step": 1575 + }, + { + "epoch": 13.47008547008547, + "grad_norm": 1.0825392007827759, + "learning_rate": 3.2649572649572655e-06, + "loss": 0.0325, + "step": 1576 + }, + { + "epoch": 13.478632478632479, + "grad_norm": 6.045910358428955, + "learning_rate": 3.260683760683761e-06, + "loss": 0.2202, + "step": 1577 + }, + { + "epoch": 13.487179487179487, + "grad_norm": 3.0401153564453125, + "learning_rate": 3.256410256410257e-06, + "loss": 0.0923, + "step": 1578 + }, + { + "epoch": 13.495726495726496, + "grad_norm": 5.485551834106445, + "learning_rate": 3.2521367521367527e-06, + "loss": 0.3851, + "step": 1579 + }, + { + "epoch": 13.504273504273504, + "grad_norm": 2.575057029724121, + "learning_rate": 3.247863247863248e-06, + "loss": 0.0307, + "step": 1580 + }, + { + "epoch": 13.512820512820513, + "grad_norm": 2.7744545936584473, + "learning_rate": 3.243589743589744e-06, + "loss": 0.1791, + "step": 1581 + }, + { + "epoch": 13.521367521367521, + "grad_norm": 2.430640459060669, + "learning_rate": 3.2393162393162395e-06, + "loss": 0.1128, + "step": 1582 + }, + { + "epoch": 13.52991452991453, + "grad_norm": 4.902276992797852, + "learning_rate": 3.2350427350427354e-06, + "loss": 0.2661, + "step": 1583 + }, + { + "epoch": 13.538461538461538, + "grad_norm": 2.601134777069092, + "learning_rate": 3.2307692307692313e-06, + "loss": 0.1311, + "step": 1584 + }, + { + "epoch": 13.547008547008547, + "grad_norm": 6.309877395629883, + "learning_rate": 3.2264957264957268e-06, + "loss": 0.2621, + "step": 1585 + }, + { + "epoch": 13.555555555555555, + "grad_norm": 2.079618215560913, + "learning_rate": 3.2222222222222227e-06, + "loss": 0.0702, + "step": 1586 + }, + { + "epoch": 13.564102564102564, + "grad_norm": 2.309541702270508, + "learning_rate": 3.2179487179487186e-06, + "loss": 0.1577, + "step": 1587 + }, + { + "epoch": 13.572649572649572, + "grad_norm": 4.723629951477051, + "learning_rate": 3.213675213675214e-06, + "loss": 0.142, + "step": 1588 + }, + { + "epoch": 13.581196581196581, + "grad_norm": 2.557123899459839, + "learning_rate": 3.20940170940171e-06, + "loss": 0.1506, + "step": 1589 + }, + { + "epoch": 13.58974358974359, + "grad_norm": 2.3154499530792236, + "learning_rate": 3.205128205128206e-06, + "loss": 0.1039, + "step": 1590 + }, + { + "epoch": 13.598290598290598, + "grad_norm": 1.5464012622833252, + "learning_rate": 3.200854700854701e-06, + "loss": 0.0989, + "step": 1591 + }, + { + "epoch": 13.606837606837606, + "grad_norm": 1.5885653495788574, + "learning_rate": 3.1965811965811967e-06, + "loss": 0.0278, + "step": 1592 + }, + { + "epoch": 13.615384615384615, + "grad_norm": 2.7710390090942383, + "learning_rate": 3.192307692307692e-06, + "loss": 0.0521, + "step": 1593 + }, + { + "epoch": 13.623931623931623, + "grad_norm": 4.587305545806885, + "learning_rate": 3.188034188034188e-06, + "loss": 0.2609, + "step": 1594 + }, + { + "epoch": 13.632478632478632, + "grad_norm": 4.343963623046875, + "learning_rate": 3.183760683760684e-06, + "loss": 0.1079, + "step": 1595 + }, + { + "epoch": 13.64102564102564, + "grad_norm": 2.7653536796569824, + "learning_rate": 3.1794871794871795e-06, + "loss": 0.1293, + "step": 1596 + }, + { + "epoch": 13.649572649572649, + "grad_norm": 3.1731350421905518, + "learning_rate": 3.1752136752136753e-06, + "loss": 0.1279, + "step": 1597 + }, + { + "epoch": 13.658119658119658, + "grad_norm": 8.032745361328125, + "learning_rate": 3.1709401709401712e-06, + "loss": 0.2114, + "step": 1598 + }, + { + "epoch": 13.666666666666666, + "grad_norm": 5.6177263259887695, + "learning_rate": 3.1666666666666667e-06, + "loss": 0.0926, + "step": 1599 + }, + { + "epoch": 13.675213675213675, + "grad_norm": 3.3568480014801025, + "learning_rate": 3.1623931623931626e-06, + "loss": 0.1299, + "step": 1600 + }, + { + "epoch": 13.683760683760683, + "grad_norm": 5.182860374450684, + "learning_rate": 3.158119658119658e-06, + "loss": 0.1688, + "step": 1601 + }, + { + "epoch": 13.692307692307692, + "grad_norm": 5.954287052154541, + "learning_rate": 3.153846153846154e-06, + "loss": 0.2634, + "step": 1602 + }, + { + "epoch": 13.7008547008547, + "grad_norm": 2.8563358783721924, + "learning_rate": 3.14957264957265e-06, + "loss": 0.0469, + "step": 1603 + }, + { + "epoch": 13.709401709401709, + "grad_norm": 1.6049034595489502, + "learning_rate": 3.1452991452991453e-06, + "loss": 0.0855, + "step": 1604 + }, + { + "epoch": 13.717948717948717, + "grad_norm": 1.9734570980072021, + "learning_rate": 3.141025641025641e-06, + "loss": 0.0554, + "step": 1605 + }, + { + "epoch": 13.726495726495726, + "grad_norm": 1.8398605585098267, + "learning_rate": 3.136752136752137e-06, + "loss": 0.1033, + "step": 1606 + }, + { + "epoch": 13.735042735042736, + "grad_norm": 3.3013346195220947, + "learning_rate": 3.1324786324786326e-06, + "loss": 0.1476, + "step": 1607 + }, + { + "epoch": 13.743589743589745, + "grad_norm": 1.2622041702270508, + "learning_rate": 3.1282051282051284e-06, + "loss": 0.0222, + "step": 1608 + }, + { + "epoch": 13.752136752136753, + "grad_norm": 3.983888626098633, + "learning_rate": 3.1239316239316243e-06, + "loss": 0.0861, + "step": 1609 + }, + { + "epoch": 13.760683760683762, + "grad_norm": 2.883335828781128, + "learning_rate": 3.11965811965812e-06, + "loss": 0.0737, + "step": 1610 + }, + { + "epoch": 13.76923076923077, + "grad_norm": 0.9045059680938721, + "learning_rate": 3.1153846153846157e-06, + "loss": 0.0232, + "step": 1611 + }, + { + "epoch": 13.777777777777779, + "grad_norm": 1.8752232789993286, + "learning_rate": 3.1111111111111116e-06, + "loss": 0.0602, + "step": 1612 + }, + { + "epoch": 13.786324786324787, + "grad_norm": 3.088440418243408, + "learning_rate": 3.106837606837607e-06, + "loss": 0.102, + "step": 1613 + }, + { + "epoch": 13.794871794871796, + "grad_norm": 4.067224502563477, + "learning_rate": 3.102564102564103e-06, + "loss": 0.1461, + "step": 1614 + }, + { + "epoch": 13.803418803418804, + "grad_norm": 6.9123148918151855, + "learning_rate": 3.098290598290599e-06, + "loss": 0.0752, + "step": 1615 + }, + { + "epoch": 13.811965811965813, + "grad_norm": 17.15372657775879, + "learning_rate": 3.0940170940170943e-06, + "loss": 0.5163, + "step": 1616 + }, + { + "epoch": 13.820512820512821, + "grad_norm": 2.4951720237731934, + "learning_rate": 3.08974358974359e-06, + "loss": 0.1326, + "step": 1617 + }, + { + "epoch": 13.82905982905983, + "grad_norm": 2.1316449642181396, + "learning_rate": 3.0854700854700857e-06, + "loss": 0.0469, + "step": 1618 + }, + { + "epoch": 13.837606837606838, + "grad_norm": 2.5955941677093506, + "learning_rate": 3.0811965811965815e-06, + "loss": 0.1056, + "step": 1619 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 14.360347747802734, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.4793, + "step": 1620 + }, + { + "epoch": 13.854700854700855, + "grad_norm": 1.9134567975997925, + "learning_rate": 3.072649572649573e-06, + "loss": 0.054, + "step": 1621 + }, + { + "epoch": 13.863247863247864, + "grad_norm": 3.1168692111968994, + "learning_rate": 3.068376068376069e-06, + "loss": 0.321, + "step": 1622 + }, + { + "epoch": 13.871794871794872, + "grad_norm": 4.940008163452148, + "learning_rate": 3.0641025641025647e-06, + "loss": 0.1452, + "step": 1623 + }, + { + "epoch": 13.88034188034188, + "grad_norm": 3.001660108566284, + "learning_rate": 3.05982905982906e-06, + "loss": 0.1094, + "step": 1624 + }, + { + "epoch": 13.88888888888889, + "grad_norm": 1.3110100030899048, + "learning_rate": 3.055555555555556e-06, + "loss": 0.0305, + "step": 1625 + }, + { + "epoch": 13.897435897435898, + "grad_norm": 269.3442077636719, + "learning_rate": 3.051282051282052e-06, + "loss": 0.8319, + "step": 1626 + }, + { + "epoch": 13.905982905982906, + "grad_norm": 1.5236955881118774, + "learning_rate": 3.0470085470085474e-06, + "loss": 0.0294, + "step": 1627 + }, + { + "epoch": 13.914529914529915, + "grad_norm": 1.8342583179473877, + "learning_rate": 3.042735042735043e-06, + "loss": 0.1122, + "step": 1628 + }, + { + "epoch": 13.923076923076923, + "grad_norm": 1.7902953624725342, + "learning_rate": 3.0384615384615383e-06, + "loss": 0.0426, + "step": 1629 + }, + { + "epoch": 13.931623931623932, + "grad_norm": 1.461769938468933, + "learning_rate": 3.0341880341880342e-06, + "loss": 0.0326, + "step": 1630 + }, + { + "epoch": 13.94017094017094, + "grad_norm": 2.2590038776397705, + "learning_rate": 3.02991452991453e-06, + "loss": 0.067, + "step": 1631 + }, + { + "epoch": 13.948717948717949, + "grad_norm": 0.8894402980804443, + "learning_rate": 3.0256410256410256e-06, + "loss": 0.0269, + "step": 1632 + }, + { + "epoch": 13.957264957264957, + "grad_norm": 2.097757339477539, + "learning_rate": 3.0213675213675215e-06, + "loss": 0.1211, + "step": 1633 + }, + { + "epoch": 13.965811965811966, + "grad_norm": 4.112930774688721, + "learning_rate": 3.0170940170940174e-06, + "loss": 0.1026, + "step": 1634 + }, + { + "epoch": 13.974358974358974, + "grad_norm": 4.55318021774292, + "learning_rate": 3.012820512820513e-06, + "loss": 0.2808, + "step": 1635 + }, + { + "epoch": 13.982905982905983, + "grad_norm": 2.1912014484405518, + "learning_rate": 3.0085470085470087e-06, + "loss": 0.0906, + "step": 1636 + }, + { + "epoch": 13.991452991452991, + "grad_norm": 4.612771511077881, + "learning_rate": 3.004273504273504e-06, + "loss": 0.17, + "step": 1637 + }, + { + "epoch": 14.0, + "grad_norm": 7.162411212921143, + "learning_rate": 3e-06, + "loss": 0.131, + "step": 1638 + }, + { + "epoch": 14.0, + "eval_loss": 0.06268326193094254, + "eval_runtime": 9.262, + "eval_samples_per_second": 50.313, + "eval_steps_per_second": 6.37, + "step": 1638 + }, + { + "epoch": 14.008547008547009, + "grad_norm": 4.41022253036499, + "learning_rate": 2.995726495726496e-06, + "loss": 0.1989, + "step": 1639 + }, + { + "epoch": 14.017094017094017, + "grad_norm": 2.2863216400146484, + "learning_rate": 2.9914529914529914e-06, + "loss": 0.0612, + "step": 1640 + }, + { + "epoch": 14.025641025641026, + "grad_norm": 1.5455230474472046, + "learning_rate": 2.9871794871794873e-06, + "loss": 0.0378, + "step": 1641 + }, + { + "epoch": 14.034188034188034, + "grad_norm": 0.9546025991439819, + "learning_rate": 2.9829059829059832e-06, + "loss": 0.0214, + "step": 1642 + }, + { + "epoch": 14.042735042735043, + "grad_norm": 5.546824932098389, + "learning_rate": 2.9786324786324787e-06, + "loss": 0.2502, + "step": 1643 + }, + { + "epoch": 14.051282051282051, + "grad_norm": 1.6261364221572876, + "learning_rate": 2.9743589743589746e-06, + "loss": 0.0271, + "step": 1644 + }, + { + "epoch": 14.05982905982906, + "grad_norm": 1.710256814956665, + "learning_rate": 2.9700854700854705e-06, + "loss": 0.0582, + "step": 1645 + }, + { + "epoch": 14.068376068376068, + "grad_norm": 1.2083494663238525, + "learning_rate": 2.965811965811966e-06, + "loss": 0.026, + "step": 1646 + }, + { + "epoch": 14.076923076923077, + "grad_norm": 3.6400561332702637, + "learning_rate": 2.961538461538462e-06, + "loss": 0.0896, + "step": 1647 + }, + { + "epoch": 14.085470085470085, + "grad_norm": 2.1084742546081543, + "learning_rate": 2.9572649572649577e-06, + "loss": 0.0269, + "step": 1648 + }, + { + "epoch": 14.094017094017094, + "grad_norm": 1.5661289691925049, + "learning_rate": 2.952991452991453e-06, + "loss": 0.0401, + "step": 1649 + }, + { + "epoch": 14.102564102564102, + "grad_norm": 23.358585357666016, + "learning_rate": 2.948717948717949e-06, + "loss": 0.2069, + "step": 1650 + }, + { + "epoch": 14.11111111111111, + "grad_norm": 9.171899795532227, + "learning_rate": 2.944444444444445e-06, + "loss": 0.2842, + "step": 1651 + }, + { + "epoch": 14.11965811965812, + "grad_norm": 1.3189946413040161, + "learning_rate": 2.9401709401709404e-06, + "loss": 0.0331, + "step": 1652 + }, + { + "epoch": 14.128205128205128, + "grad_norm": 3.6144192218780518, + "learning_rate": 2.9358974358974363e-06, + "loss": 0.2069, + "step": 1653 + }, + { + "epoch": 14.136752136752136, + "grad_norm": 2.764681577682495, + "learning_rate": 2.931623931623932e-06, + "loss": 0.0646, + "step": 1654 + }, + { + "epoch": 14.145299145299145, + "grad_norm": 2.073028564453125, + "learning_rate": 2.9273504273504277e-06, + "loss": 0.1223, + "step": 1655 + }, + { + "epoch": 14.153846153846153, + "grad_norm": 12.209549903869629, + "learning_rate": 2.9230769230769236e-06, + "loss": 0.1922, + "step": 1656 + }, + { + "epoch": 14.162393162393162, + "grad_norm": 3.1137638092041016, + "learning_rate": 2.918803418803419e-06, + "loss": 0.2586, + "step": 1657 + }, + { + "epoch": 14.17094017094017, + "grad_norm": 5.130307674407959, + "learning_rate": 2.914529914529915e-06, + "loss": 0.2695, + "step": 1658 + }, + { + "epoch": 14.179487179487179, + "grad_norm": 3.475097894668579, + "learning_rate": 2.910256410256411e-06, + "loss": 0.2131, + "step": 1659 + }, + { + "epoch": 14.188034188034187, + "grad_norm": 0.5851498246192932, + "learning_rate": 2.9059829059829063e-06, + "loss": 0.0167, + "step": 1660 + }, + { + "epoch": 14.196581196581196, + "grad_norm": 1.795509934425354, + "learning_rate": 2.901709401709402e-06, + "loss": 0.0857, + "step": 1661 + }, + { + "epoch": 14.205128205128204, + "grad_norm": 1.7123979330062866, + "learning_rate": 2.897435897435898e-06, + "loss": 0.0599, + "step": 1662 + }, + { + "epoch": 14.213675213675213, + "grad_norm": 1.230388879776001, + "learning_rate": 2.8931623931623935e-06, + "loss": 0.0255, + "step": 1663 + }, + { + "epoch": 14.222222222222221, + "grad_norm": 3.8747615814208984, + "learning_rate": 2.888888888888889e-06, + "loss": 0.1412, + "step": 1664 + }, + { + "epoch": 14.23076923076923, + "grad_norm": 2.233584403991699, + "learning_rate": 2.8846153846153845e-06, + "loss": 0.068, + "step": 1665 + }, + { + "epoch": 14.239316239316238, + "grad_norm": 5.327254772186279, + "learning_rate": 2.8803418803418804e-06, + "loss": 0.2616, + "step": 1666 + }, + { + "epoch": 14.247863247863247, + "grad_norm": 6.126563549041748, + "learning_rate": 2.8760683760683762e-06, + "loss": 0.0931, + "step": 1667 + }, + { + "epoch": 14.256410256410255, + "grad_norm": 1.4305050373077393, + "learning_rate": 2.8717948717948717e-06, + "loss": 0.0221, + "step": 1668 + }, + { + "epoch": 14.264957264957266, + "grad_norm": 3.0924506187438965, + "learning_rate": 2.8675213675213676e-06, + "loss": 0.0417, + "step": 1669 + }, + { + "epoch": 14.273504273504274, + "grad_norm": 2.548558235168457, + "learning_rate": 2.8632478632478635e-06, + "loss": 0.0744, + "step": 1670 + }, + { + "epoch": 14.282051282051283, + "grad_norm": 0.46632057428359985, + "learning_rate": 2.858974358974359e-06, + "loss": 0.0114, + "step": 1671 + }, + { + "epoch": 14.290598290598291, + "grad_norm": 2.5199391841888428, + "learning_rate": 2.854700854700855e-06, + "loss": 0.0819, + "step": 1672 + }, + { + "epoch": 14.2991452991453, + "grad_norm": 1.849133014678955, + "learning_rate": 2.8504273504273507e-06, + "loss": 0.0424, + "step": 1673 + }, + { + "epoch": 14.307692307692308, + "grad_norm": 2.9396777153015137, + "learning_rate": 2.846153846153846e-06, + "loss": 0.0836, + "step": 1674 + }, + { + "epoch": 14.316239316239317, + "grad_norm": 0.7128950953483582, + "learning_rate": 2.841880341880342e-06, + "loss": 0.0181, + "step": 1675 + }, + { + "epoch": 14.324786324786325, + "grad_norm": 2.1387767791748047, + "learning_rate": 2.8376068376068376e-06, + "loss": 0.0432, + "step": 1676 + }, + { + "epoch": 14.333333333333334, + "grad_norm": 7.104556083679199, + "learning_rate": 2.8333333333333335e-06, + "loss": 0.1277, + "step": 1677 + }, + { + "epoch": 14.341880341880342, + "grad_norm": 3.718749761581421, + "learning_rate": 2.8290598290598293e-06, + "loss": 0.0738, + "step": 1678 + }, + { + "epoch": 14.350427350427351, + "grad_norm": 3.9387831687927246, + "learning_rate": 2.824786324786325e-06, + "loss": 0.1374, + "step": 1679 + }, + { + "epoch": 14.35897435897436, + "grad_norm": 2.1527843475341797, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.1426, + "step": 1680 + }, + { + "epoch": 14.367521367521368, + "grad_norm": 1.0589011907577515, + "learning_rate": 2.8162393162393166e-06, + "loss": 0.0343, + "step": 1681 + }, + { + "epoch": 14.376068376068377, + "grad_norm": 3.55014967918396, + "learning_rate": 2.811965811965812e-06, + "loss": 0.2962, + "step": 1682 + }, + { + "epoch": 14.384615384615385, + "grad_norm": 3.996713399887085, + "learning_rate": 2.807692307692308e-06, + "loss": 0.1458, + "step": 1683 + }, + { + "epoch": 14.393162393162394, + "grad_norm": 73.28384399414062, + "learning_rate": 2.803418803418804e-06, + "loss": 0.6138, + "step": 1684 + }, + { + "epoch": 14.401709401709402, + "grad_norm": 5.780628681182861, + "learning_rate": 2.7991452991452993e-06, + "loss": 0.2619, + "step": 1685 + }, + { + "epoch": 14.41025641025641, + "grad_norm": 3.2047317028045654, + "learning_rate": 2.794871794871795e-06, + "loss": 0.1917, + "step": 1686 + }, + { + "epoch": 14.418803418803419, + "grad_norm": 7.041647434234619, + "learning_rate": 2.790598290598291e-06, + "loss": 0.2136, + "step": 1687 + }, + { + "epoch": 14.427350427350428, + "grad_norm": 3.391404867172241, + "learning_rate": 2.7863247863247866e-06, + "loss": 0.094, + "step": 1688 + }, + { + "epoch": 14.435897435897436, + "grad_norm": 0.5430964231491089, + "learning_rate": 2.7820512820512824e-06, + "loss": 0.0139, + "step": 1689 + }, + { + "epoch": 14.444444444444445, + "grad_norm": 5.696547985076904, + "learning_rate": 2.7777777777777783e-06, + "loss": 0.5808, + "step": 1690 + }, + { + "epoch": 14.452991452991453, + "grad_norm": 3.5785481929779053, + "learning_rate": 2.773504273504274e-06, + "loss": 0.219, + "step": 1691 + }, + { + "epoch": 14.461538461538462, + "grad_norm": 6.63624906539917, + "learning_rate": 2.7692307692307697e-06, + "loss": 0.2586, + "step": 1692 + }, + { + "epoch": 14.47008547008547, + "grad_norm": 16.79705810546875, + "learning_rate": 2.764957264957265e-06, + "loss": 0.1762, + "step": 1693 + }, + { + "epoch": 14.478632478632479, + "grad_norm": 4.069973468780518, + "learning_rate": 2.760683760683761e-06, + "loss": 0.1191, + "step": 1694 + }, + { + "epoch": 14.487179487179487, + "grad_norm": 1.1191340684890747, + "learning_rate": 2.756410256410257e-06, + "loss": 0.0529, + "step": 1695 + }, + { + "epoch": 14.495726495726496, + "grad_norm": 2.23835825920105, + "learning_rate": 2.7521367521367524e-06, + "loss": 0.0681, + "step": 1696 + }, + { + "epoch": 14.504273504273504, + "grad_norm": 2.745694160461426, + "learning_rate": 2.7478632478632483e-06, + "loss": 0.1885, + "step": 1697 + }, + { + "epoch": 14.512820512820513, + "grad_norm": 3.642946720123291, + "learning_rate": 2.743589743589744e-06, + "loss": 0.2061, + "step": 1698 + }, + { + "epoch": 14.521367521367521, + "grad_norm": 2.7571651935577393, + "learning_rate": 2.7393162393162397e-06, + "loss": 0.074, + "step": 1699 + }, + { + "epoch": 14.52991452991453, + "grad_norm": 0.889057457447052, + "learning_rate": 2.7350427350427355e-06, + "loss": 0.0342, + "step": 1700 + }, + { + "epoch": 14.538461538461538, + "grad_norm": 0.5471668243408203, + "learning_rate": 2.7307692307692306e-06, + "loss": 0.0125, + "step": 1701 + }, + { + "epoch": 14.547008547008547, + "grad_norm": 6.883024215698242, + "learning_rate": 2.7264957264957265e-06, + "loss": 0.4102, + "step": 1702 + }, + { + "epoch": 14.555555555555555, + "grad_norm": 2.6678171157836914, + "learning_rate": 2.7222222222222224e-06, + "loss": 0.0872, + "step": 1703 + }, + { + "epoch": 14.564102564102564, + "grad_norm": 5.825995445251465, + "learning_rate": 2.717948717948718e-06, + "loss": 0.1081, + "step": 1704 + }, + { + "epoch": 14.572649572649572, + "grad_norm": 1.5447179079055786, + "learning_rate": 2.7136752136752137e-06, + "loss": 0.0838, + "step": 1705 + }, + { + "epoch": 14.581196581196581, + "grad_norm": 17.58099937438965, + "learning_rate": 2.7094017094017096e-06, + "loss": 0.6379, + "step": 1706 + }, + { + "epoch": 14.58974358974359, + "grad_norm": 0.9537908434867859, + "learning_rate": 2.705128205128205e-06, + "loss": 0.0221, + "step": 1707 + }, + { + "epoch": 14.598290598290598, + "grad_norm": 3.264037847518921, + "learning_rate": 2.700854700854701e-06, + "loss": 0.1282, + "step": 1708 + }, + { + "epoch": 14.606837606837606, + "grad_norm": 1.7752703428268433, + "learning_rate": 2.696581196581197e-06, + "loss": 0.0194, + "step": 1709 + }, + { + "epoch": 14.615384615384615, + "grad_norm": 4.8417649269104, + "learning_rate": 2.6923076923076923e-06, + "loss": 0.2217, + "step": 1710 + }, + { + "epoch": 14.623931623931623, + "grad_norm": 2.915694236755371, + "learning_rate": 2.6880341880341882e-06, + "loss": 0.1506, + "step": 1711 + }, + { + "epoch": 14.632478632478632, + "grad_norm": 10.983115196228027, + "learning_rate": 2.6837606837606837e-06, + "loss": 0.4307, + "step": 1712 + }, + { + "epoch": 14.64102564102564, + "grad_norm": 1.1121952533721924, + "learning_rate": 2.6794871794871796e-06, + "loss": 0.0211, + "step": 1713 + }, + { + "epoch": 14.649572649572649, + "grad_norm": 2.6676313877105713, + "learning_rate": 2.6752136752136755e-06, + "loss": 0.0997, + "step": 1714 + }, + { + "epoch": 14.658119658119658, + "grad_norm": 1.718767523765564, + "learning_rate": 2.670940170940171e-06, + "loss": 0.0533, + "step": 1715 + }, + { + "epoch": 14.666666666666666, + "grad_norm": 1.567866563796997, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0913, + "step": 1716 + }, + { + "epoch": 14.675213675213675, + "grad_norm": 3.0697431564331055, + "learning_rate": 2.6623931623931627e-06, + "loss": 0.1133, + "step": 1717 + }, + { + "epoch": 14.683760683760683, + "grad_norm": 2.2237489223480225, + "learning_rate": 2.658119658119658e-06, + "loss": 0.1091, + "step": 1718 + }, + { + "epoch": 14.692307692307692, + "grad_norm": 6.050041198730469, + "learning_rate": 2.653846153846154e-06, + "loss": 0.5622, + "step": 1719 + }, + { + "epoch": 14.7008547008547, + "grad_norm": 1.1796153783798218, + "learning_rate": 2.64957264957265e-06, + "loss": 0.0522, + "step": 1720 + }, + { + "epoch": 14.709401709401709, + "grad_norm": 2.4849863052368164, + "learning_rate": 2.6452991452991454e-06, + "loss": 0.0332, + "step": 1721 + }, + { + "epoch": 14.717948717948717, + "grad_norm": 1.771933674812317, + "learning_rate": 2.6410256410256413e-06, + "loss": 0.0692, + "step": 1722 + }, + { + "epoch": 14.726495726495726, + "grad_norm": 4.174441337585449, + "learning_rate": 2.6367521367521372e-06, + "loss": 0.1419, + "step": 1723 + }, + { + "epoch": 14.735042735042736, + "grad_norm": 4.145920276641846, + "learning_rate": 2.6324786324786327e-06, + "loss": 0.5196, + "step": 1724 + }, + { + "epoch": 14.743589743589745, + "grad_norm": 3.363537073135376, + "learning_rate": 2.6282051282051286e-06, + "loss": 0.1187, + "step": 1725 + }, + { + "epoch": 14.752136752136753, + "grad_norm": 1.9558751583099365, + "learning_rate": 2.6239316239316245e-06, + "loss": 0.0193, + "step": 1726 + }, + { + "epoch": 14.760683760683762, + "grad_norm": 2.8293466567993164, + "learning_rate": 2.61965811965812e-06, + "loss": 0.0551, + "step": 1727 + }, + { + "epoch": 14.76923076923077, + "grad_norm": 1.2654905319213867, + "learning_rate": 2.615384615384616e-06, + "loss": 0.0805, + "step": 1728 + }, + { + "epoch": 14.777777777777779, + "grad_norm": 0.9344054460525513, + "learning_rate": 2.6111111111111113e-06, + "loss": 0.0177, + "step": 1729 + }, + { + "epoch": 14.786324786324787, + "grad_norm": 1.268433690071106, + "learning_rate": 2.606837606837607e-06, + "loss": 0.0185, + "step": 1730 + }, + { + "epoch": 14.794871794871796, + "grad_norm": 2.5544192790985107, + "learning_rate": 2.602564102564103e-06, + "loss": 0.063, + "step": 1731 + }, + { + "epoch": 14.803418803418804, + "grad_norm": 2.1078386306762695, + "learning_rate": 2.5982905982905985e-06, + "loss": 0.1203, + "step": 1732 + }, + { + "epoch": 14.811965811965813, + "grad_norm": 1.526848554611206, + "learning_rate": 2.5940170940170944e-06, + "loss": 0.0524, + "step": 1733 + }, + { + "epoch": 14.820512820512821, + "grad_norm": 0.7479220628738403, + "learning_rate": 2.5897435897435903e-06, + "loss": 0.0197, + "step": 1734 + }, + { + "epoch": 14.82905982905983, + "grad_norm": 2.937556266784668, + "learning_rate": 2.5854700854700858e-06, + "loss": 0.1406, + "step": 1735 + }, + { + "epoch": 14.837606837606838, + "grad_norm": 2.3128576278686523, + "learning_rate": 2.5811965811965817e-06, + "loss": 0.056, + "step": 1736 + }, + { + "epoch": 14.846153846153847, + "grad_norm": 2.1093039512634277, + "learning_rate": 2.5769230769230767e-06, + "loss": 0.0645, + "step": 1737 + }, + { + "epoch": 14.854700854700855, + "grad_norm": 2.104214668273926, + "learning_rate": 2.5726495726495726e-06, + "loss": 0.1097, + "step": 1738 + }, + { + "epoch": 14.863247863247864, + "grad_norm": 3.781390428543091, + "learning_rate": 2.5683760683760685e-06, + "loss": 0.1214, + "step": 1739 + }, + { + "epoch": 14.871794871794872, + "grad_norm": 4.119661331176758, + "learning_rate": 2.564102564102564e-06, + "loss": 0.1797, + "step": 1740 + }, + { + "epoch": 14.88034188034188, + "grad_norm": 6.488205909729004, + "learning_rate": 2.55982905982906e-06, + "loss": 0.0679, + "step": 1741 + }, + { + "epoch": 14.88888888888889, + "grad_norm": 1.4211604595184326, + "learning_rate": 2.5555555555555557e-06, + "loss": 0.0375, + "step": 1742 + }, + { + "epoch": 14.897435897435898, + "grad_norm": 3.577533721923828, + "learning_rate": 2.5512820512820512e-06, + "loss": 0.1914, + "step": 1743 + }, + { + "epoch": 14.905982905982906, + "grad_norm": 8.697205543518066, + "learning_rate": 2.547008547008547e-06, + "loss": 0.5511, + "step": 1744 + }, + { + "epoch": 14.914529914529915, + "grad_norm": 0.49716269969940186, + "learning_rate": 2.542735042735043e-06, + "loss": 0.0125, + "step": 1745 + }, + { + "epoch": 14.923076923076923, + "grad_norm": 2.8563008308410645, + "learning_rate": 2.5384615384615385e-06, + "loss": 0.0901, + "step": 1746 + }, + { + "epoch": 14.931623931623932, + "grad_norm": 3.6407926082611084, + "learning_rate": 2.5341880341880344e-06, + "loss": 0.0718, + "step": 1747 + }, + { + "epoch": 14.94017094017094, + "grad_norm": 1.2601441144943237, + "learning_rate": 2.52991452991453e-06, + "loss": 0.0451, + "step": 1748 + }, + { + "epoch": 14.948717948717949, + "grad_norm": 2.4402401447296143, + "learning_rate": 2.5256410256410257e-06, + "loss": 0.0771, + "step": 1749 + }, + { + "epoch": 14.957264957264957, + "grad_norm": 0.6150484681129456, + "learning_rate": 2.5213675213675216e-06, + "loss": 0.0151, + "step": 1750 + }, + { + "epoch": 14.965811965811966, + "grad_norm": 3.6569836139678955, + "learning_rate": 2.517094017094017e-06, + "loss": 0.0905, + "step": 1751 + }, + { + "epoch": 14.974358974358974, + "grad_norm": 3.4421300888061523, + "learning_rate": 2.512820512820513e-06, + "loss": 0.0456, + "step": 1752 + }, + { + "epoch": 14.982905982905983, + "grad_norm": 3.565871477127075, + "learning_rate": 2.508547008547009e-06, + "loss": 0.0491, + "step": 1753 + }, + { + "epoch": 14.991452991452991, + "grad_norm": 37.519065856933594, + "learning_rate": 2.5042735042735043e-06, + "loss": 0.1348, + "step": 1754 + }, + { + "epoch": 15.0, + "grad_norm": 5.1902899742126465, + "learning_rate": 2.5e-06, + "loss": 0.1099, + "step": 1755 + }, + { + "epoch": 15.0, + "eval_loss": 0.05930963531136513, + "eval_runtime": 9.2206, + "eval_samples_per_second": 50.539, + "eval_steps_per_second": 6.399, + "step": 1755 + }, + { + "epoch": 15.008547008547009, + "grad_norm": 5.6569342613220215, + "learning_rate": 2.495726495726496e-06, + "loss": 0.1931, + "step": 1756 + }, + { + "epoch": 15.017094017094017, + "grad_norm": 5.23728084564209, + "learning_rate": 2.4914529914529916e-06, + "loss": 0.2789, + "step": 1757 + }, + { + "epoch": 15.025641025641026, + "grad_norm": 0.8648807406425476, + "learning_rate": 2.4871794871794875e-06, + "loss": 0.0227, + "step": 1758 + }, + { + "epoch": 15.034188034188034, + "grad_norm": 3.0654587745666504, + "learning_rate": 2.4829059829059833e-06, + "loss": 0.0602, + "step": 1759 + }, + { + "epoch": 15.042735042735043, + "grad_norm": 4.374608039855957, + "learning_rate": 2.478632478632479e-06, + "loss": 0.2133, + "step": 1760 + }, + { + "epoch": 15.051282051282051, + "grad_norm": 1.2764301300048828, + "learning_rate": 2.4743589743589747e-06, + "loss": 0.0296, + "step": 1761 + }, + { + "epoch": 15.05982905982906, + "grad_norm": 0.9672349095344543, + "learning_rate": 2.4700854700854706e-06, + "loss": 0.0224, + "step": 1762 + }, + { + "epoch": 15.068376068376068, + "grad_norm": 8.807465553283691, + "learning_rate": 2.465811965811966e-06, + "loss": 0.0925, + "step": 1763 + }, + { + "epoch": 15.076923076923077, + "grad_norm": 1.4733474254608154, + "learning_rate": 2.461538461538462e-06, + "loss": 0.0286, + "step": 1764 + }, + { + "epoch": 15.085470085470085, + "grad_norm": 6.014289855957031, + "learning_rate": 2.4572649572649574e-06, + "loss": 0.1387, + "step": 1765 + }, + { + "epoch": 15.094017094017094, + "grad_norm": 1.899086356163025, + "learning_rate": 2.452991452991453e-06, + "loss": 0.07, + "step": 1766 + }, + { + "epoch": 15.102564102564102, + "grad_norm": 11.32197380065918, + "learning_rate": 2.4487179487179488e-06, + "loss": 0.2452, + "step": 1767 + }, + { + "epoch": 15.11111111111111, + "grad_norm": 3.223996639251709, + "learning_rate": 2.4444444444444447e-06, + "loss": 0.139, + "step": 1768 + }, + { + "epoch": 15.11965811965812, + "grad_norm": 2.8729913234710693, + "learning_rate": 2.44017094017094e-06, + "loss": 0.1386, + "step": 1769 + }, + { + "epoch": 15.128205128205128, + "grad_norm": 1.9730579853057861, + "learning_rate": 2.435897435897436e-06, + "loss": 0.0882, + "step": 1770 + }, + { + "epoch": 15.136752136752136, + "grad_norm": 5.556413650512695, + "learning_rate": 2.431623931623932e-06, + "loss": 0.1554, + "step": 1771 + }, + { + "epoch": 15.145299145299145, + "grad_norm": 1.2356898784637451, + "learning_rate": 2.4273504273504274e-06, + "loss": 0.0217, + "step": 1772 + }, + { + "epoch": 15.153846153846153, + "grad_norm": 7.849127769470215, + "learning_rate": 2.4230769230769233e-06, + "loss": 0.221, + "step": 1773 + }, + { + "epoch": 15.162393162393162, + "grad_norm": 0.5792569518089294, + "learning_rate": 2.418803418803419e-06, + "loss": 0.017, + "step": 1774 + }, + { + "epoch": 15.17094017094017, + "grad_norm": 2.2549376487731934, + "learning_rate": 2.4145299145299146e-06, + "loss": 0.0499, + "step": 1775 + }, + { + "epoch": 15.179487179487179, + "grad_norm": 2.722200870513916, + "learning_rate": 2.4102564102564105e-06, + "loss": 0.0408, + "step": 1776 + }, + { + "epoch": 15.188034188034187, + "grad_norm": 3.1140944957733154, + "learning_rate": 2.4059829059829064e-06, + "loss": 0.1001, + "step": 1777 + }, + { + "epoch": 15.196581196581196, + "grad_norm": 4.461791515350342, + "learning_rate": 2.401709401709402e-06, + "loss": 0.3419, + "step": 1778 + }, + { + "epoch": 15.205128205128204, + "grad_norm": 1.8562372922897339, + "learning_rate": 2.3974358974358978e-06, + "loss": 0.1092, + "step": 1779 + }, + { + "epoch": 15.213675213675213, + "grad_norm": 5.2086181640625, + "learning_rate": 2.3931623931623937e-06, + "loss": 0.1767, + "step": 1780 + }, + { + "epoch": 15.222222222222221, + "grad_norm": 1.6226582527160645, + "learning_rate": 2.388888888888889e-06, + "loss": 0.0347, + "step": 1781 + }, + { + "epoch": 15.23076923076923, + "grad_norm": 2.8507306575775146, + "learning_rate": 2.384615384615385e-06, + "loss": 0.0934, + "step": 1782 + }, + { + "epoch": 15.239316239316238, + "grad_norm": 2.74642276763916, + "learning_rate": 2.3803418803418805e-06, + "loss": 0.0857, + "step": 1783 + }, + { + "epoch": 15.247863247863247, + "grad_norm": 3.4352660179138184, + "learning_rate": 2.376068376068376e-06, + "loss": 0.2336, + "step": 1784 + }, + { + "epoch": 15.256410256410255, + "grad_norm": 3.4673473834991455, + "learning_rate": 2.371794871794872e-06, + "loss": 0.1974, + "step": 1785 + }, + { + "epoch": 15.264957264957266, + "grad_norm": 21.467744827270508, + "learning_rate": 2.3675213675213677e-06, + "loss": 0.6836, + "step": 1786 + }, + { + "epoch": 15.273504273504274, + "grad_norm": 2.832465887069702, + "learning_rate": 2.363247863247863e-06, + "loss": 0.245, + "step": 1787 + }, + { + "epoch": 15.282051282051283, + "grad_norm": 9.717825889587402, + "learning_rate": 2.358974358974359e-06, + "loss": 0.5324, + "step": 1788 + }, + { + "epoch": 15.290598290598291, + "grad_norm": 2.209528923034668, + "learning_rate": 2.354700854700855e-06, + "loss": 0.0854, + "step": 1789 + }, + { + "epoch": 15.2991452991453, + "grad_norm": 4.554971218109131, + "learning_rate": 2.3504273504273504e-06, + "loss": 0.1271, + "step": 1790 + }, + { + "epoch": 15.307692307692308, + "grad_norm": 3.1280457973480225, + "learning_rate": 2.3461538461538463e-06, + "loss": 0.1265, + "step": 1791 + }, + { + "epoch": 15.316239316239317, + "grad_norm": 2.647224187850952, + "learning_rate": 2.3418803418803422e-06, + "loss": 0.1965, + "step": 1792 + }, + { + "epoch": 15.324786324786325, + "grad_norm": 2.7695155143737793, + "learning_rate": 2.3376068376068377e-06, + "loss": 0.0528, + "step": 1793 + }, + { + "epoch": 15.333333333333334, + "grad_norm": 20.151025772094727, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.2011, + "step": 1794 + }, + { + "epoch": 15.341880341880342, + "grad_norm": 2.8718080520629883, + "learning_rate": 2.3290598290598295e-06, + "loss": 0.0502, + "step": 1795 + }, + { + "epoch": 15.350427350427351, + "grad_norm": 2.17462158203125, + "learning_rate": 2.324786324786325e-06, + "loss": 0.0658, + "step": 1796 + }, + { + "epoch": 15.35897435897436, + "grad_norm": 4.324810981750488, + "learning_rate": 2.320512820512821e-06, + "loss": 0.1429, + "step": 1797 + }, + { + "epoch": 15.367521367521368, + "grad_norm": 184.52798461914062, + "learning_rate": 2.3162393162393167e-06, + "loss": 0.5155, + "step": 1798 + }, + { + "epoch": 15.376068376068377, + "grad_norm": 2.6076488494873047, + "learning_rate": 2.311965811965812e-06, + "loss": 0.0708, + "step": 1799 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 3.0682790279388428, + "learning_rate": 2.307692307692308e-06, + "loss": 0.2662, + "step": 1800 + }, + { + "epoch": 15.393162393162394, + "grad_norm": 1.3366855382919312, + "learning_rate": 2.3034188034188035e-06, + "loss": 0.0136, + "step": 1801 + }, + { + "epoch": 15.401709401709402, + "grad_norm": 0.5489670634269714, + "learning_rate": 2.299145299145299e-06, + "loss": 0.0148, + "step": 1802 + }, + { + "epoch": 15.41025641025641, + "grad_norm": 1.080804705619812, + "learning_rate": 2.294871794871795e-06, + "loss": 0.025, + "step": 1803 + }, + { + "epoch": 15.418803418803419, + "grad_norm": 8.801629066467285, + "learning_rate": 2.290598290598291e-06, + "loss": 0.2038, + "step": 1804 + }, + { + "epoch": 15.427350427350428, + "grad_norm": 66.96419525146484, + "learning_rate": 2.2863247863247863e-06, + "loss": 0.4094, + "step": 1805 + }, + { + "epoch": 15.435897435897436, + "grad_norm": 1.3400782346725464, + "learning_rate": 2.282051282051282e-06, + "loss": 0.0452, + "step": 1806 + }, + { + "epoch": 15.444444444444445, + "grad_norm": 3.5850300788879395, + "learning_rate": 2.277777777777778e-06, + "loss": 0.0919, + "step": 1807 + }, + { + "epoch": 15.452991452991453, + "grad_norm": 8.670539855957031, + "learning_rate": 2.2735042735042735e-06, + "loss": 0.255, + "step": 1808 + }, + { + "epoch": 15.461538461538462, + "grad_norm": 3.609617233276367, + "learning_rate": 2.2692307692307694e-06, + "loss": 0.1203, + "step": 1809 + }, + { + "epoch": 15.47008547008547, + "grad_norm": 1.5857924222946167, + "learning_rate": 2.2649572649572653e-06, + "loss": 0.0371, + "step": 1810 + }, + { + "epoch": 15.478632478632479, + "grad_norm": 1.386805534362793, + "learning_rate": 2.2606837606837608e-06, + "loss": 0.0385, + "step": 1811 + }, + { + "epoch": 15.487179487179487, + "grad_norm": 4.130802631378174, + "learning_rate": 2.2564102564102566e-06, + "loss": 0.2261, + "step": 1812 + }, + { + "epoch": 15.495726495726496, + "grad_norm": 2.974247455596924, + "learning_rate": 2.2521367521367525e-06, + "loss": 0.0651, + "step": 1813 + }, + { + "epoch": 15.504273504273504, + "grad_norm": 1.2551554441452026, + "learning_rate": 2.247863247863248e-06, + "loss": 0.0229, + "step": 1814 + }, + { + "epoch": 15.512820512820513, + "grad_norm": 3.1401453018188477, + "learning_rate": 2.243589743589744e-06, + "loss": 0.0409, + "step": 1815 + }, + { + "epoch": 15.521367521367521, + "grad_norm": 1.3921948671340942, + "learning_rate": 2.2393162393162398e-06, + "loss": 0.0335, + "step": 1816 + }, + { + "epoch": 15.52991452991453, + "grad_norm": 5.457981586456299, + "learning_rate": 2.2350427350427353e-06, + "loss": 0.22, + "step": 1817 + }, + { + "epoch": 15.538461538461538, + "grad_norm": 0.9100427031517029, + "learning_rate": 2.230769230769231e-06, + "loss": 0.0217, + "step": 1818 + }, + { + "epoch": 15.547008547008547, + "grad_norm": 3.5890519618988037, + "learning_rate": 2.2264957264957266e-06, + "loss": 0.2241, + "step": 1819 + }, + { + "epoch": 15.555555555555555, + "grad_norm": 2.965954303741455, + "learning_rate": 2.222222222222222e-06, + "loss": 0.1453, + "step": 1820 + }, + { + "epoch": 15.564102564102564, + "grad_norm": 8.436135292053223, + "learning_rate": 2.217948717948718e-06, + "loss": 0.2784, + "step": 1821 + }, + { + "epoch": 15.572649572649572, + "grad_norm": 2.043687582015991, + "learning_rate": 2.213675213675214e-06, + "loss": 0.0755, + "step": 1822 + }, + { + "epoch": 15.581196581196581, + "grad_norm": 2.380276918411255, + "learning_rate": 2.2094017094017093e-06, + "loss": 0.1867, + "step": 1823 + }, + { + "epoch": 15.58974358974359, + "grad_norm": 2.5189390182495117, + "learning_rate": 2.2051282051282052e-06, + "loss": 0.0619, + "step": 1824 + }, + { + "epoch": 15.598290598290598, + "grad_norm": 1.123610258102417, + "learning_rate": 2.200854700854701e-06, + "loss": 0.0286, + "step": 1825 + }, + { + "epoch": 15.606837606837606, + "grad_norm": 3.0018534660339355, + "learning_rate": 2.1965811965811966e-06, + "loss": 0.1449, + "step": 1826 + }, + { + "epoch": 15.615384615384615, + "grad_norm": 2.178926706314087, + "learning_rate": 2.1923076923076925e-06, + "loss": 0.0859, + "step": 1827 + }, + { + "epoch": 15.623931623931623, + "grad_norm": 5.799438953399658, + "learning_rate": 2.1880341880341884e-06, + "loss": 0.2669, + "step": 1828 + }, + { + "epoch": 15.632478632478632, + "grad_norm": 2.0338144302368164, + "learning_rate": 2.183760683760684e-06, + "loss": 0.0616, + "step": 1829 + }, + { + "epoch": 15.64102564102564, + "grad_norm": 3.789525032043457, + "learning_rate": 2.1794871794871797e-06, + "loss": 0.0439, + "step": 1830 + }, + { + "epoch": 15.649572649572649, + "grad_norm": 2.3695919513702393, + "learning_rate": 2.1752136752136756e-06, + "loss": 0.0979, + "step": 1831 + }, + { + "epoch": 15.658119658119658, + "grad_norm": 0.8543546795845032, + "learning_rate": 2.170940170940171e-06, + "loss": 0.0171, + "step": 1832 + }, + { + "epoch": 15.666666666666666, + "grad_norm": 3.7921054363250732, + "learning_rate": 2.166666666666667e-06, + "loss": 0.1094, + "step": 1833 + }, + { + "epoch": 15.675213675213675, + "grad_norm": 1.9967904090881348, + "learning_rate": 2.162393162393163e-06, + "loss": 0.0382, + "step": 1834 + }, + { + "epoch": 15.683760683760683, + "grad_norm": 2.5073959827423096, + "learning_rate": 2.1581196581196583e-06, + "loss": 0.0554, + "step": 1835 + }, + { + "epoch": 15.692307692307692, + "grad_norm": 1.2741888761520386, + "learning_rate": 2.153846153846154e-06, + "loss": 0.056, + "step": 1836 + }, + { + "epoch": 15.7008547008547, + "grad_norm": 1.992280125617981, + "learning_rate": 2.1495726495726497e-06, + "loss": 0.0206, + "step": 1837 + }, + { + "epoch": 15.709401709401709, + "grad_norm": 1.0176990032196045, + "learning_rate": 2.145299145299145e-06, + "loss": 0.0276, + "step": 1838 + }, + { + "epoch": 15.717948717948717, + "grad_norm": 1.6685941219329834, + "learning_rate": 2.141025641025641e-06, + "loss": 0.0222, + "step": 1839 + }, + { + "epoch": 15.726495726495726, + "grad_norm": 3.171050548553467, + "learning_rate": 2.136752136752137e-06, + "loss": 0.1526, + "step": 1840 + }, + { + "epoch": 15.735042735042736, + "grad_norm": 1.5068336725234985, + "learning_rate": 2.1324786324786324e-06, + "loss": 0.0271, + "step": 1841 + }, + { + "epoch": 15.743589743589745, + "grad_norm": 3.171870708465576, + "learning_rate": 2.1282051282051283e-06, + "loss": 0.0628, + "step": 1842 + }, + { + "epoch": 15.752136752136753, + "grad_norm": 1.9212791919708252, + "learning_rate": 2.123931623931624e-06, + "loss": 0.1018, + "step": 1843 + }, + { + "epoch": 15.760683760683762, + "grad_norm": 4.073456287384033, + "learning_rate": 2.1196581196581196e-06, + "loss": 0.1144, + "step": 1844 + }, + { + "epoch": 15.76923076923077, + "grad_norm": 1.8453985452651978, + "learning_rate": 2.1153846153846155e-06, + "loss": 0.0995, + "step": 1845 + }, + { + "epoch": 15.777777777777779, + "grad_norm": 3.285759210586548, + "learning_rate": 2.1111111111111114e-06, + "loss": 0.1173, + "step": 1846 + }, + { + "epoch": 15.786324786324787, + "grad_norm": 3.709202289581299, + "learning_rate": 2.106837606837607e-06, + "loss": 0.1906, + "step": 1847 + }, + { + "epoch": 15.794871794871796, + "grad_norm": 1.951262354850769, + "learning_rate": 2.1025641025641028e-06, + "loss": 0.0954, + "step": 1848 + }, + { + "epoch": 15.803418803418804, + "grad_norm": 3.249171257019043, + "learning_rate": 2.0982905982905987e-06, + "loss": 0.1258, + "step": 1849 + }, + { + "epoch": 15.811965811965813, + "grad_norm": 0.5708752274513245, + "learning_rate": 2.094017094017094e-06, + "loss": 0.0128, + "step": 1850 + }, + { + "epoch": 15.820512820512821, + "grad_norm": 3.2894484996795654, + "learning_rate": 2.08974358974359e-06, + "loss": 0.0621, + "step": 1851 + }, + { + "epoch": 15.82905982905983, + "grad_norm": 0.8564540147781372, + "learning_rate": 2.085470085470086e-06, + "loss": 0.0194, + "step": 1852 + }, + { + "epoch": 15.837606837606838, + "grad_norm": 3.319011926651001, + "learning_rate": 2.0811965811965814e-06, + "loss": 0.1413, + "step": 1853 + }, + { + "epoch": 15.846153846153847, + "grad_norm": 1.5385066270828247, + "learning_rate": 2.0769230769230773e-06, + "loss": 0.0316, + "step": 1854 + }, + { + "epoch": 15.854700854700855, + "grad_norm": 4.076297283172607, + "learning_rate": 2.072649572649573e-06, + "loss": 0.2257, + "step": 1855 + }, + { + "epoch": 15.863247863247864, + "grad_norm": 4.738671779632568, + "learning_rate": 2.068376068376068e-06, + "loss": 0.1627, + "step": 1856 + }, + { + "epoch": 15.871794871794872, + "grad_norm": 5.589550495147705, + "learning_rate": 2.064102564102564e-06, + "loss": 0.3182, + "step": 1857 + }, + { + "epoch": 15.88034188034188, + "grad_norm": 1.6303757429122925, + "learning_rate": 2.05982905982906e-06, + "loss": 0.0384, + "step": 1858 + }, + { + "epoch": 15.88888888888889, + "grad_norm": 3.0257458686828613, + "learning_rate": 2.0555555555555555e-06, + "loss": 0.0967, + "step": 1859 + }, + { + "epoch": 15.897435897435898, + "grad_norm": 2.4926559925079346, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.0703, + "step": 1860 + }, + { + "epoch": 15.905982905982906, + "grad_norm": 2.0784358978271484, + "learning_rate": 2.0470085470085472e-06, + "loss": 0.062, + "step": 1861 + }, + { + "epoch": 15.914529914529915, + "grad_norm": 4.92131233215332, + "learning_rate": 2.0427350427350427e-06, + "loss": 0.0875, + "step": 1862 + }, + { + "epoch": 15.923076923076923, + "grad_norm": 2.999511241912842, + "learning_rate": 2.0384615384615386e-06, + "loss": 0.0388, + "step": 1863 + }, + { + "epoch": 15.931623931623932, + "grad_norm": 5.770095348358154, + "learning_rate": 2.0341880341880345e-06, + "loss": 0.1257, + "step": 1864 + }, + { + "epoch": 15.94017094017094, + "grad_norm": 4.730950832366943, + "learning_rate": 2.02991452991453e-06, + "loss": 0.2386, + "step": 1865 + }, + { + "epoch": 15.948717948717949, + "grad_norm": 1.8125661611557007, + "learning_rate": 2.025641025641026e-06, + "loss": 0.0433, + "step": 1866 + }, + { + "epoch": 15.957264957264957, + "grad_norm": 5.433501243591309, + "learning_rate": 2.0213675213675217e-06, + "loss": 0.0536, + "step": 1867 + }, + { + "epoch": 15.965811965811966, + "grad_norm": 1.2565219402313232, + "learning_rate": 2.017094017094017e-06, + "loss": 0.0263, + "step": 1868 + }, + { + "epoch": 15.974358974358974, + "grad_norm": 1.5660192966461182, + "learning_rate": 2.012820512820513e-06, + "loss": 0.0387, + "step": 1869 + }, + { + "epoch": 15.982905982905983, + "grad_norm": 5.742929935455322, + "learning_rate": 2.008547008547009e-06, + "loss": 0.2158, + "step": 1870 + }, + { + "epoch": 15.991452991452991, + "grad_norm": 3.597506284713745, + "learning_rate": 2.0042735042735044e-06, + "loss": 0.0962, + "step": 1871 + }, + { + "epoch": 16.0, + "grad_norm": 1.753219485282898, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0193, + "step": 1872 + }, + { + "epoch": 16.0, + "eval_loss": 0.05589358136057854, + "eval_runtime": 9.2203, + "eval_samples_per_second": 50.541, + "eval_steps_per_second": 6.399, + "step": 1872 + }, + { + "epoch": 16.00854700854701, + "grad_norm": 9.627431869506836, + "learning_rate": 1.9957264957264962e-06, + "loss": 0.4748, + "step": 1873 + }, + { + "epoch": 16.017094017094017, + "grad_norm": 7.770556926727295, + "learning_rate": 1.9914529914529917e-06, + "loss": 0.2615, + "step": 1874 + }, + { + "epoch": 16.025641025641026, + "grad_norm": 1.7268822193145752, + "learning_rate": 1.987179487179487e-06, + "loss": 0.0808, + "step": 1875 + }, + { + "epoch": 16.034188034188034, + "grad_norm": 1.7209370136260986, + "learning_rate": 1.982905982905983e-06, + "loss": 0.0575, + "step": 1876 + }, + { + "epoch": 16.042735042735043, + "grad_norm": 2.6422786712646484, + "learning_rate": 1.9786324786324785e-06, + "loss": 0.0815, + "step": 1877 + }, + { + "epoch": 16.05128205128205, + "grad_norm": 0.9057373404502869, + "learning_rate": 1.9743589743589744e-06, + "loss": 0.0359, + "step": 1878 + }, + { + "epoch": 16.05982905982906, + "grad_norm": 1.4879076480865479, + "learning_rate": 1.9700854700854703e-06, + "loss": 0.0658, + "step": 1879 + }, + { + "epoch": 16.068376068376068, + "grad_norm": 2.1336488723754883, + "learning_rate": 1.9658119658119658e-06, + "loss": 0.0434, + "step": 1880 + }, + { + "epoch": 16.076923076923077, + "grad_norm": 2.642249822616577, + "learning_rate": 1.9615384615384617e-06, + "loss": 0.0768, + "step": 1881 + }, + { + "epoch": 16.085470085470085, + "grad_norm": 398.1800842285156, + "learning_rate": 1.9572649572649575e-06, + "loss": 1.7061, + "step": 1882 + }, + { + "epoch": 16.094017094017094, + "grad_norm": 1.6067556142807007, + "learning_rate": 1.952991452991453e-06, + "loss": 0.0492, + "step": 1883 + }, + { + "epoch": 16.102564102564102, + "grad_norm": 45.67499542236328, + "learning_rate": 1.948717948717949e-06, + "loss": 0.2883, + "step": 1884 + }, + { + "epoch": 16.11111111111111, + "grad_norm": 5.477624416351318, + "learning_rate": 1.944444444444445e-06, + "loss": 0.1107, + "step": 1885 + }, + { + "epoch": 16.11965811965812, + "grad_norm": 2.2795376777648926, + "learning_rate": 1.9401709401709403e-06, + "loss": 0.0427, + "step": 1886 + }, + { + "epoch": 16.128205128205128, + "grad_norm": 1.9572805166244507, + "learning_rate": 1.935897435897436e-06, + "loss": 0.04, + "step": 1887 + }, + { + "epoch": 16.136752136752136, + "grad_norm": 1.9205402135849, + "learning_rate": 1.931623931623932e-06, + "loss": 0.0384, + "step": 1888 + }, + { + "epoch": 16.145299145299145, + "grad_norm": 1.6124738454818726, + "learning_rate": 1.9273504273504275e-06, + "loss": 0.0322, + "step": 1889 + }, + { + "epoch": 16.153846153846153, + "grad_norm": 3.3396270275115967, + "learning_rate": 1.9230769230769234e-06, + "loss": 0.1302, + "step": 1890 + }, + { + "epoch": 16.162393162393162, + "grad_norm": 2.4800124168395996, + "learning_rate": 1.9188034188034193e-06, + "loss": 0.1181, + "step": 1891 + }, + { + "epoch": 16.17094017094017, + "grad_norm": 5.452153205871582, + "learning_rate": 1.9145299145299148e-06, + "loss": 0.2054, + "step": 1892 + }, + { + "epoch": 16.17948717948718, + "grad_norm": 4.445066452026367, + "learning_rate": 1.9102564102564102e-06, + "loss": 0.1649, + "step": 1893 + }, + { + "epoch": 16.188034188034187, + "grad_norm": 1.0402263402938843, + "learning_rate": 1.9059829059829061e-06, + "loss": 0.0285, + "step": 1894 + }, + { + "epoch": 16.196581196581196, + "grad_norm": 1.8124594688415527, + "learning_rate": 1.9017094017094018e-06, + "loss": 0.0717, + "step": 1895 + }, + { + "epoch": 16.205128205128204, + "grad_norm": 5.0620245933532715, + "learning_rate": 1.8974358974358975e-06, + "loss": 0.3833, + "step": 1896 + }, + { + "epoch": 16.213675213675213, + "grad_norm": 3.201596975326538, + "learning_rate": 1.8931623931623931e-06, + "loss": 0.0687, + "step": 1897 + }, + { + "epoch": 16.22222222222222, + "grad_norm": 0.9610732793807983, + "learning_rate": 1.888888888888889e-06, + "loss": 0.0165, + "step": 1898 + }, + { + "epoch": 16.23076923076923, + "grad_norm": 1.3409554958343506, + "learning_rate": 1.8846153846153847e-06, + "loss": 0.024, + "step": 1899 + }, + { + "epoch": 16.23931623931624, + "grad_norm": 1.2862681150436401, + "learning_rate": 1.8803418803418804e-06, + "loss": 0.042, + "step": 1900 + }, + { + "epoch": 16.247863247863247, + "grad_norm": 6.403625011444092, + "learning_rate": 1.8760683760683763e-06, + "loss": 0.5536, + "step": 1901 + }, + { + "epoch": 16.256410256410255, + "grad_norm": 3.241731882095337, + "learning_rate": 1.871794871794872e-06, + "loss": 0.1045, + "step": 1902 + }, + { + "epoch": 16.264957264957264, + "grad_norm": 1.1206634044647217, + "learning_rate": 1.8675213675213676e-06, + "loss": 0.0383, + "step": 1903 + }, + { + "epoch": 16.273504273504273, + "grad_norm": 3.3005762100219727, + "learning_rate": 1.8632478632478635e-06, + "loss": 0.0786, + "step": 1904 + }, + { + "epoch": 16.28205128205128, + "grad_norm": 0.44867634773254395, + "learning_rate": 1.8589743589743592e-06, + "loss": 0.0104, + "step": 1905 + }, + { + "epoch": 16.29059829059829, + "grad_norm": 2.7023422718048096, + "learning_rate": 1.8547008547008549e-06, + "loss": 0.1091, + "step": 1906 + }, + { + "epoch": 16.299145299145298, + "grad_norm": 0.9612734317779541, + "learning_rate": 1.8504273504273506e-06, + "loss": 0.0165, + "step": 1907 + }, + { + "epoch": 16.307692307692307, + "grad_norm": 3.0632894039154053, + "learning_rate": 1.8461538461538465e-06, + "loss": 0.1118, + "step": 1908 + }, + { + "epoch": 16.316239316239315, + "grad_norm": 3.932769775390625, + "learning_rate": 1.8418803418803421e-06, + "loss": 0.1084, + "step": 1909 + }, + { + "epoch": 16.324786324786324, + "grad_norm": 7.795356273651123, + "learning_rate": 1.8376068376068378e-06, + "loss": 0.2923, + "step": 1910 + }, + { + "epoch": 16.333333333333332, + "grad_norm": 1.4187766313552856, + "learning_rate": 1.8333333333333333e-06, + "loss": 0.0408, + "step": 1911 + }, + { + "epoch": 16.34188034188034, + "grad_norm": 1.1020699739456177, + "learning_rate": 1.8290598290598292e-06, + "loss": 0.0168, + "step": 1912 + }, + { + "epoch": 16.35042735042735, + "grad_norm": 0.9890375733375549, + "learning_rate": 1.8247863247863249e-06, + "loss": 0.0391, + "step": 1913 + }, + { + "epoch": 16.358974358974358, + "grad_norm": 39.418235778808594, + "learning_rate": 1.8205128205128205e-06, + "loss": 0.2804, + "step": 1914 + }, + { + "epoch": 16.367521367521366, + "grad_norm": 1.6613589525222778, + "learning_rate": 1.8162393162393164e-06, + "loss": 0.0475, + "step": 1915 + }, + { + "epoch": 16.376068376068375, + "grad_norm": 4.359612464904785, + "learning_rate": 1.811965811965812e-06, + "loss": 0.2247, + "step": 1916 + }, + { + "epoch": 16.384615384615383, + "grad_norm": 1.970078706741333, + "learning_rate": 1.8076923076923078e-06, + "loss": 0.03, + "step": 1917 + }, + { + "epoch": 16.39316239316239, + "grad_norm": 2.046025037765503, + "learning_rate": 1.8034188034188035e-06, + "loss": 0.0277, + "step": 1918 + }, + { + "epoch": 16.4017094017094, + "grad_norm": 1.5775028467178345, + "learning_rate": 1.7991452991452994e-06, + "loss": 0.0764, + "step": 1919 + }, + { + "epoch": 16.41025641025641, + "grad_norm": 2.8837273120880127, + "learning_rate": 1.794871794871795e-06, + "loss": 0.0903, + "step": 1920 + }, + { + "epoch": 16.418803418803417, + "grad_norm": 7.059972763061523, + "learning_rate": 1.7905982905982907e-06, + "loss": 0.0679, + "step": 1921 + }, + { + "epoch": 16.427350427350426, + "grad_norm": 3.6101839542388916, + "learning_rate": 1.7863247863247866e-06, + "loss": 0.1402, + "step": 1922 + }, + { + "epoch": 16.435897435897434, + "grad_norm": 2.3459484577178955, + "learning_rate": 1.7820512820512823e-06, + "loss": 0.0751, + "step": 1923 + }, + { + "epoch": 16.444444444444443, + "grad_norm": 2.0556280612945557, + "learning_rate": 1.777777777777778e-06, + "loss": 0.0452, + "step": 1924 + }, + { + "epoch": 16.45299145299145, + "grad_norm": 0.5339368581771851, + "learning_rate": 1.7735042735042736e-06, + "loss": 0.013, + "step": 1925 + }, + { + "epoch": 16.46153846153846, + "grad_norm": 1.393329381942749, + "learning_rate": 1.7692307692307695e-06, + "loss": 0.038, + "step": 1926 + }, + { + "epoch": 16.47008547008547, + "grad_norm": 0.9439583420753479, + "learning_rate": 1.7649572649572652e-06, + "loss": 0.0228, + "step": 1927 + }, + { + "epoch": 16.478632478632477, + "grad_norm": 3.437713384628296, + "learning_rate": 1.7606837606837609e-06, + "loss": 0.2072, + "step": 1928 + }, + { + "epoch": 16.487179487179485, + "grad_norm": 1.725557804107666, + "learning_rate": 1.7564102564102563e-06, + "loss": 0.0494, + "step": 1929 + }, + { + "epoch": 16.495726495726494, + "grad_norm": 2.4226529598236084, + "learning_rate": 1.7521367521367522e-06, + "loss": 0.0796, + "step": 1930 + }, + { + "epoch": 16.504273504273506, + "grad_norm": 36.0551643371582, + "learning_rate": 1.747863247863248e-06, + "loss": 0.1966, + "step": 1931 + }, + { + "epoch": 16.51282051282051, + "grad_norm": 0.8370515704154968, + "learning_rate": 1.7435897435897436e-06, + "loss": 0.0346, + "step": 1932 + }, + { + "epoch": 16.521367521367523, + "grad_norm": 2.486854314804077, + "learning_rate": 1.7393162393162395e-06, + "loss": 0.1423, + "step": 1933 + }, + { + "epoch": 16.52991452991453, + "grad_norm": 3.2457993030548096, + "learning_rate": 1.7350427350427352e-06, + "loss": 0.1894, + "step": 1934 + }, + { + "epoch": 16.53846153846154, + "grad_norm": 2.1744906902313232, + "learning_rate": 1.7307692307692308e-06, + "loss": 0.0889, + "step": 1935 + }, + { + "epoch": 16.54700854700855, + "grad_norm": 1.9443250894546509, + "learning_rate": 1.7264957264957265e-06, + "loss": 0.0413, + "step": 1936 + }, + { + "epoch": 16.555555555555557, + "grad_norm": 2.0389249324798584, + "learning_rate": 1.7222222222222224e-06, + "loss": 0.0798, + "step": 1937 + }, + { + "epoch": 16.564102564102566, + "grad_norm": 4.600223064422607, + "learning_rate": 1.717948717948718e-06, + "loss": 0.0706, + "step": 1938 + }, + { + "epoch": 16.572649572649574, + "grad_norm": 1.4231921434402466, + "learning_rate": 1.7136752136752138e-06, + "loss": 0.0856, + "step": 1939 + }, + { + "epoch": 16.581196581196583, + "grad_norm": 4.8655290603637695, + "learning_rate": 1.7094017094017097e-06, + "loss": 0.2519, + "step": 1940 + }, + { + "epoch": 16.58974358974359, + "grad_norm": 2.6834962368011475, + "learning_rate": 1.7051282051282053e-06, + "loss": 0.0328, + "step": 1941 + }, + { + "epoch": 16.5982905982906, + "grad_norm": 0.625557541847229, + "learning_rate": 1.700854700854701e-06, + "loss": 0.0129, + "step": 1942 + }, + { + "epoch": 16.60683760683761, + "grad_norm": 10.57834243774414, + "learning_rate": 1.6965811965811967e-06, + "loss": 0.2987, + "step": 1943 + }, + { + "epoch": 16.615384615384617, + "grad_norm": 1.2357791662216187, + "learning_rate": 1.6923076923076926e-06, + "loss": 0.0294, + "step": 1944 + }, + { + "epoch": 16.623931623931625, + "grad_norm": 1.8380581140518188, + "learning_rate": 1.6880341880341883e-06, + "loss": 0.0298, + "step": 1945 + }, + { + "epoch": 16.632478632478634, + "grad_norm": 1.2370020151138306, + "learning_rate": 1.683760683760684e-06, + "loss": 0.0285, + "step": 1946 + }, + { + "epoch": 16.641025641025642, + "grad_norm": 5.922267913818359, + "learning_rate": 1.6794871794871794e-06, + "loss": 0.24, + "step": 1947 + }, + { + "epoch": 16.64957264957265, + "grad_norm": 2.439023494720459, + "learning_rate": 1.6752136752136753e-06, + "loss": 0.0988, + "step": 1948 + }, + { + "epoch": 16.65811965811966, + "grad_norm": 0.8908723592758179, + "learning_rate": 1.670940170940171e-06, + "loss": 0.026, + "step": 1949 + }, + { + "epoch": 16.666666666666668, + "grad_norm": 0.8728394508361816, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.018, + "step": 1950 + }, + { + "epoch": 16.675213675213676, + "grad_norm": 2.7304019927978516, + "learning_rate": 1.6623931623931626e-06, + "loss": 0.1567, + "step": 1951 + }, + { + "epoch": 16.683760683760685, + "grad_norm": 2.8601150512695312, + "learning_rate": 1.6581196581196582e-06, + "loss": 0.0721, + "step": 1952 + }, + { + "epoch": 16.692307692307693, + "grad_norm": 2.5990025997161865, + "learning_rate": 1.653846153846154e-06, + "loss": 0.2296, + "step": 1953 + }, + { + "epoch": 16.700854700854702, + "grad_norm": 3.7956109046936035, + "learning_rate": 1.6495726495726496e-06, + "loss": 0.2565, + "step": 1954 + }, + { + "epoch": 16.70940170940171, + "grad_norm": 5.933072566986084, + "learning_rate": 1.6452991452991455e-06, + "loss": 0.2712, + "step": 1955 + }, + { + "epoch": 16.71794871794872, + "grad_norm": 0.5651862621307373, + "learning_rate": 1.6410256410256412e-06, + "loss": 0.0132, + "step": 1956 + }, + { + "epoch": 16.726495726495727, + "grad_norm": 3.033231735229492, + "learning_rate": 1.6367521367521368e-06, + "loss": 0.074, + "step": 1957 + }, + { + "epoch": 16.735042735042736, + "grad_norm": 1.3515870571136475, + "learning_rate": 1.6324786324786327e-06, + "loss": 0.0614, + "step": 1958 + }, + { + "epoch": 16.743589743589745, + "grad_norm": 3.091700792312622, + "learning_rate": 1.6282051282051284e-06, + "loss": 0.1284, + "step": 1959 + }, + { + "epoch": 16.752136752136753, + "grad_norm": 7.142216205596924, + "learning_rate": 1.623931623931624e-06, + "loss": 0.1965, + "step": 1960 + }, + { + "epoch": 16.76068376068376, + "grad_norm": 7.488593578338623, + "learning_rate": 1.6196581196581198e-06, + "loss": 0.2498, + "step": 1961 + }, + { + "epoch": 16.76923076923077, + "grad_norm": 3.943833351135254, + "learning_rate": 1.6153846153846157e-06, + "loss": 0.0967, + "step": 1962 + }, + { + "epoch": 16.77777777777778, + "grad_norm": 1.8732318878173828, + "learning_rate": 1.6111111111111113e-06, + "loss": 0.029, + "step": 1963 + }, + { + "epoch": 16.786324786324787, + "grad_norm": 2.5445902347564697, + "learning_rate": 1.606837606837607e-06, + "loss": 0.0808, + "step": 1964 + }, + { + "epoch": 16.794871794871796, + "grad_norm": 4.969367504119873, + "learning_rate": 1.602564102564103e-06, + "loss": 0.164, + "step": 1965 + }, + { + "epoch": 16.803418803418804, + "grad_norm": 1.6954468488693237, + "learning_rate": 1.5982905982905984e-06, + "loss": 0.0645, + "step": 1966 + }, + { + "epoch": 16.811965811965813, + "grad_norm": 1.536352276802063, + "learning_rate": 1.594017094017094e-06, + "loss": 0.0595, + "step": 1967 + }, + { + "epoch": 16.82051282051282, + "grad_norm": 0.7326592803001404, + "learning_rate": 1.5897435897435897e-06, + "loss": 0.0153, + "step": 1968 + }, + { + "epoch": 16.82905982905983, + "grad_norm": 10.959025382995605, + "learning_rate": 1.5854700854700856e-06, + "loss": 0.3274, + "step": 1969 + }, + { + "epoch": 16.837606837606838, + "grad_norm": 10.305845260620117, + "learning_rate": 1.5811965811965813e-06, + "loss": 0.1404, + "step": 1970 + }, + { + "epoch": 16.846153846153847, + "grad_norm": 7.498697280883789, + "learning_rate": 1.576923076923077e-06, + "loss": 0.2269, + "step": 1971 + }, + { + "epoch": 16.854700854700855, + "grad_norm": 0.29253125190734863, + "learning_rate": 1.5726495726495727e-06, + "loss": 0.0074, + "step": 1972 + }, + { + "epoch": 16.863247863247864, + "grad_norm": 9.320234298706055, + "learning_rate": 1.5683760683760685e-06, + "loss": 0.067, + "step": 1973 + }, + { + "epoch": 16.871794871794872, + "grad_norm": 6.572272300720215, + "learning_rate": 1.5641025641025642e-06, + "loss": 0.4577, + "step": 1974 + }, + { + "epoch": 16.88034188034188, + "grad_norm": 5.368937969207764, + "learning_rate": 1.55982905982906e-06, + "loss": 0.2016, + "step": 1975 + }, + { + "epoch": 16.88888888888889, + "grad_norm": 0.5891698598861694, + "learning_rate": 1.5555555555555558e-06, + "loss": 0.0174, + "step": 1976 + }, + { + "epoch": 16.897435897435898, + "grad_norm": 3.045989751815796, + "learning_rate": 1.5512820512820515e-06, + "loss": 0.1748, + "step": 1977 + }, + { + "epoch": 16.905982905982906, + "grad_norm": 3.013834238052368, + "learning_rate": 1.5470085470085471e-06, + "loss": 0.2283, + "step": 1978 + }, + { + "epoch": 16.914529914529915, + "grad_norm": 1.2644447088241577, + "learning_rate": 1.5427350427350428e-06, + "loss": 0.0302, + "step": 1979 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 4.429958820343018, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.2458, + "step": 1980 + }, + { + "epoch": 16.931623931623932, + "grad_norm": 1.1556981801986694, + "learning_rate": 1.5341880341880344e-06, + "loss": 0.0179, + "step": 1981 + }, + { + "epoch": 16.94017094017094, + "grad_norm": 1.4588316679000854, + "learning_rate": 1.52991452991453e-06, + "loss": 0.1063, + "step": 1982 + }, + { + "epoch": 16.94871794871795, + "grad_norm": 1.124496340751648, + "learning_rate": 1.525641025641026e-06, + "loss": 0.0278, + "step": 1983 + }, + { + "epoch": 16.957264957264957, + "grad_norm": 0.7231981754302979, + "learning_rate": 1.5213675213675214e-06, + "loss": 0.0141, + "step": 1984 + }, + { + "epoch": 16.965811965811966, + "grad_norm": 1.4819642305374146, + "learning_rate": 1.5170940170940171e-06, + "loss": 0.0601, + "step": 1985 + }, + { + "epoch": 16.974358974358974, + "grad_norm": 0.7296791672706604, + "learning_rate": 1.5128205128205128e-06, + "loss": 0.0215, + "step": 1986 + }, + { + "epoch": 16.982905982905983, + "grad_norm": 15.651564598083496, + "learning_rate": 1.5085470085470087e-06, + "loss": 0.2954, + "step": 1987 + }, + { + "epoch": 16.99145299145299, + "grad_norm": 0.48891735076904297, + "learning_rate": 1.5042735042735044e-06, + "loss": 0.015, + "step": 1988 + }, + { + "epoch": 17.0, + "grad_norm": 7.363093376159668, + "learning_rate": 1.5e-06, + "loss": 0.2366, + "step": 1989 + }, + { + "epoch": 17.0, + "eval_loss": 0.05406723916530609, + "eval_runtime": 9.389, + "eval_samples_per_second": 49.633, + "eval_steps_per_second": 6.284, + "step": 1989 + }, + { + "epoch": 17.00854700854701, + "grad_norm": 2.8626017570495605, + "learning_rate": 1.4957264957264957e-06, + "loss": 0.0902, + "step": 1990 + }, + { + "epoch": 17.017094017094017, + "grad_norm": 2.461879253387451, + "learning_rate": 1.4914529914529916e-06, + "loss": 0.0387, + "step": 1991 + }, + { + "epoch": 17.025641025641026, + "grad_norm": 6.336863994598389, + "learning_rate": 1.4871794871794873e-06, + "loss": 0.196, + "step": 1992 + }, + { + "epoch": 17.034188034188034, + "grad_norm": 1.1044467687606812, + "learning_rate": 1.482905982905983e-06, + "loss": 0.0352, + "step": 1993 + }, + { + "epoch": 17.042735042735043, + "grad_norm": 3.3509342670440674, + "learning_rate": 1.4786324786324789e-06, + "loss": 0.1459, + "step": 1994 + }, + { + "epoch": 17.05128205128205, + "grad_norm": 3.2349629402160645, + "learning_rate": 1.4743589743589745e-06, + "loss": 0.0179, + "step": 1995 + }, + { + "epoch": 17.05982905982906, + "grad_norm": 3.650749921798706, + "learning_rate": 1.4700854700854702e-06, + "loss": 0.1549, + "step": 1996 + }, + { + "epoch": 17.068376068376068, + "grad_norm": 1.6349891424179077, + "learning_rate": 1.465811965811966e-06, + "loss": 0.0713, + "step": 1997 + }, + { + "epoch": 17.076923076923077, + "grad_norm": 8.602070808410645, + "learning_rate": 1.4615384615384618e-06, + "loss": 0.3582, + "step": 1998 + }, + { + "epoch": 17.085470085470085, + "grad_norm": 3.1162590980529785, + "learning_rate": 1.4572649572649575e-06, + "loss": 0.2455, + "step": 1999 + }, + { + "epoch": 17.094017094017094, + "grad_norm": 1.4878407716751099, + "learning_rate": 1.4529914529914531e-06, + "loss": 0.0195, + "step": 2000 + }, + { + "epoch": 17.102564102564102, + "grad_norm": 2.565297842025757, + "learning_rate": 1.448717948717949e-06, + "loss": 0.1126, + "step": 2001 + }, + { + "epoch": 17.11111111111111, + "grad_norm": 4.169450759887695, + "learning_rate": 1.4444444444444445e-06, + "loss": 0.1774, + "step": 2002 + }, + { + "epoch": 17.11965811965812, + "grad_norm": 1.8476792573928833, + "learning_rate": 1.4401709401709402e-06, + "loss": 0.0288, + "step": 2003 + }, + { + "epoch": 17.128205128205128, + "grad_norm": 0.7279506921768188, + "learning_rate": 1.4358974358974359e-06, + "loss": 0.0217, + "step": 2004 + }, + { + "epoch": 17.136752136752136, + "grad_norm": 7.387227535247803, + "learning_rate": 1.4316239316239317e-06, + "loss": 0.248, + "step": 2005 + }, + { + "epoch": 17.145299145299145, + "grad_norm": 2.9455361366271973, + "learning_rate": 1.4273504273504274e-06, + "loss": 0.0439, + "step": 2006 + }, + { + "epoch": 17.153846153846153, + "grad_norm": 6.015694618225098, + "learning_rate": 1.423076923076923e-06, + "loss": 0.0656, + "step": 2007 + }, + { + "epoch": 17.162393162393162, + "grad_norm": 1.741774320602417, + "learning_rate": 1.4188034188034188e-06, + "loss": 0.0344, + "step": 2008 + }, + { + "epoch": 17.17094017094017, + "grad_norm": 0.5282659530639648, + "learning_rate": 1.4145299145299147e-06, + "loss": 0.0128, + "step": 2009 + }, + { + "epoch": 17.17948717948718, + "grad_norm": 2.4927468299865723, + "learning_rate": 1.4102564102564104e-06, + "loss": 0.1839, + "step": 2010 + }, + { + "epoch": 17.188034188034187, + "grad_norm": 0.7872166037559509, + "learning_rate": 1.405982905982906e-06, + "loss": 0.0204, + "step": 2011 + }, + { + "epoch": 17.196581196581196, + "grad_norm": 0.7072253227233887, + "learning_rate": 1.401709401709402e-06, + "loss": 0.0206, + "step": 2012 + }, + { + "epoch": 17.205128205128204, + "grad_norm": 1.0154236555099487, + "learning_rate": 1.3974358974358976e-06, + "loss": 0.0238, + "step": 2013 + }, + { + "epoch": 17.213675213675213, + "grad_norm": 2.9798424243927, + "learning_rate": 1.3931623931623933e-06, + "loss": 0.0542, + "step": 2014 + }, + { + "epoch": 17.22222222222222, + "grad_norm": 0.9568426012992859, + "learning_rate": 1.3888888888888892e-06, + "loss": 0.0239, + "step": 2015 + }, + { + "epoch": 17.23076923076923, + "grad_norm": 10.525039672851562, + "learning_rate": 1.3846153846153848e-06, + "loss": 0.1768, + "step": 2016 + }, + { + "epoch": 17.23931623931624, + "grad_norm": 1.697314977645874, + "learning_rate": 1.3803418803418805e-06, + "loss": 0.0453, + "step": 2017 + }, + { + "epoch": 17.247863247863247, + "grad_norm": 0.6436419486999512, + "learning_rate": 1.3760683760683762e-06, + "loss": 0.0163, + "step": 2018 + }, + { + "epoch": 17.256410256410255, + "grad_norm": 4.984555721282959, + "learning_rate": 1.371794871794872e-06, + "loss": 0.1157, + "step": 2019 + }, + { + "epoch": 17.264957264957264, + "grad_norm": 9.088909149169922, + "learning_rate": 1.3675213675213678e-06, + "loss": 0.2842, + "step": 2020 + }, + { + "epoch": 17.273504273504273, + "grad_norm": 10.398246765136719, + "learning_rate": 1.3632478632478632e-06, + "loss": 0.2528, + "step": 2021 + }, + { + "epoch": 17.28205128205128, + "grad_norm": 3.60273814201355, + "learning_rate": 1.358974358974359e-06, + "loss": 0.1799, + "step": 2022 + }, + { + "epoch": 17.29059829059829, + "grad_norm": 0.6845250129699707, + "learning_rate": 1.3547008547008548e-06, + "loss": 0.0196, + "step": 2023 + }, + { + "epoch": 17.299145299145298, + "grad_norm": 0.5363795161247253, + "learning_rate": 1.3504273504273505e-06, + "loss": 0.0136, + "step": 2024 + }, + { + "epoch": 17.307692307692307, + "grad_norm": 3.880434274673462, + "learning_rate": 1.3461538461538462e-06, + "loss": 0.3665, + "step": 2025 + }, + { + "epoch": 17.316239316239315, + "grad_norm": 4.580989360809326, + "learning_rate": 1.3418803418803418e-06, + "loss": 0.2593, + "step": 2026 + }, + { + "epoch": 17.324786324786324, + "grad_norm": 2.781501293182373, + "learning_rate": 1.3376068376068377e-06, + "loss": 0.1777, + "step": 2027 + }, + { + "epoch": 17.333333333333332, + "grad_norm": 5.605004787445068, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.3633, + "step": 2028 + }, + { + "epoch": 17.34188034188034, + "grad_norm": 1.696486473083496, + "learning_rate": 1.329059829059829e-06, + "loss": 0.0353, + "step": 2029 + }, + { + "epoch": 17.35042735042735, + "grad_norm": 3.4415268898010254, + "learning_rate": 1.324786324786325e-06, + "loss": 0.0906, + "step": 2030 + }, + { + "epoch": 17.358974358974358, + "grad_norm": 7.722592353820801, + "learning_rate": 1.3205128205128207e-06, + "loss": 0.1804, + "step": 2031 + }, + { + "epoch": 17.367521367521366, + "grad_norm": 3.3161542415618896, + "learning_rate": 1.3162393162393163e-06, + "loss": 0.1336, + "step": 2032 + }, + { + "epoch": 17.376068376068375, + "grad_norm": 2.568871021270752, + "learning_rate": 1.3119658119658122e-06, + "loss": 0.0658, + "step": 2033 + }, + { + "epoch": 17.384615384615383, + "grad_norm": 3.5799806118011475, + "learning_rate": 1.307692307692308e-06, + "loss": 0.0652, + "step": 2034 + }, + { + "epoch": 17.39316239316239, + "grad_norm": 1.1399949789047241, + "learning_rate": 1.3034188034188036e-06, + "loss": 0.0196, + "step": 2035 + }, + { + "epoch": 17.4017094017094, + "grad_norm": 2.3688738346099854, + "learning_rate": 1.2991452991452993e-06, + "loss": 0.0706, + "step": 2036 + }, + { + "epoch": 17.41025641025641, + "grad_norm": 12.726486206054688, + "learning_rate": 1.2948717948717952e-06, + "loss": 0.2506, + "step": 2037 + }, + { + "epoch": 17.418803418803417, + "grad_norm": 2.249285936355591, + "learning_rate": 1.2905982905982908e-06, + "loss": 0.0532, + "step": 2038 + }, + { + "epoch": 17.427350427350426, + "grad_norm": 0.7129601836204529, + "learning_rate": 1.2863247863247863e-06, + "loss": 0.0207, + "step": 2039 + }, + { + "epoch": 17.435897435897434, + "grad_norm": 1.9362183809280396, + "learning_rate": 1.282051282051282e-06, + "loss": 0.0311, + "step": 2040 + }, + { + "epoch": 17.444444444444443, + "grad_norm": 2.253690242767334, + "learning_rate": 1.2777777777777779e-06, + "loss": 0.1203, + "step": 2041 + }, + { + "epoch": 17.45299145299145, + "grad_norm": 3.835174798965454, + "learning_rate": 1.2735042735042736e-06, + "loss": 0.0928, + "step": 2042 + }, + { + "epoch": 17.46153846153846, + "grad_norm": 143.36563110351562, + "learning_rate": 1.2692307692307692e-06, + "loss": 0.2984, + "step": 2043 + }, + { + "epoch": 17.47008547008547, + "grad_norm": 0.6122754216194153, + "learning_rate": 1.264957264957265e-06, + "loss": 0.0171, + "step": 2044 + }, + { + "epoch": 17.478632478632477, + "grad_norm": 3.0697991847991943, + "learning_rate": 1.2606837606837608e-06, + "loss": 0.1412, + "step": 2045 + }, + { + "epoch": 17.487179487179485, + "grad_norm": 1.0684096813201904, + "learning_rate": 1.2564102564102565e-06, + "loss": 0.0278, + "step": 2046 + }, + { + "epoch": 17.495726495726494, + "grad_norm": 5.379480838775635, + "learning_rate": 1.2521367521367522e-06, + "loss": 0.1114, + "step": 2047 + }, + { + "epoch": 17.504273504273506, + "grad_norm": 3.893343448638916, + "learning_rate": 1.247863247863248e-06, + "loss": 0.1499, + "step": 2048 + }, + { + "epoch": 17.51282051282051, + "grad_norm": 1.0436211824417114, + "learning_rate": 1.2435897435897437e-06, + "loss": 0.0259, + "step": 2049 + }, + { + "epoch": 17.521367521367523, + "grad_norm": 2.8706037998199463, + "learning_rate": 1.2393162393162394e-06, + "loss": 0.1071, + "step": 2050 + }, + { + "epoch": 17.52991452991453, + "grad_norm": 1.5661158561706543, + "learning_rate": 1.2350427350427353e-06, + "loss": 0.0392, + "step": 2051 + }, + { + "epoch": 17.53846153846154, + "grad_norm": 3.7152199745178223, + "learning_rate": 1.230769230769231e-06, + "loss": 0.0698, + "step": 2052 + }, + { + "epoch": 17.54700854700855, + "grad_norm": 2.6527271270751953, + "learning_rate": 1.2264957264957264e-06, + "loss": 0.1276, + "step": 2053 + }, + { + "epoch": 17.555555555555557, + "grad_norm": 0.9018534421920776, + "learning_rate": 1.2222222222222223e-06, + "loss": 0.066, + "step": 2054 + }, + { + "epoch": 17.564102564102566, + "grad_norm": 7.11035680770874, + "learning_rate": 1.217948717948718e-06, + "loss": 0.0836, + "step": 2055 + }, + { + "epoch": 17.572649572649574, + "grad_norm": 2.5168066024780273, + "learning_rate": 1.2136752136752137e-06, + "loss": 0.0662, + "step": 2056 + }, + { + "epoch": 17.581196581196583, + "grad_norm": 0.7215616703033447, + "learning_rate": 1.2094017094017096e-06, + "loss": 0.0186, + "step": 2057 + }, + { + "epoch": 17.58974358974359, + "grad_norm": 7.076876640319824, + "learning_rate": 1.2051282051282053e-06, + "loss": 0.1493, + "step": 2058 + }, + { + "epoch": 17.5982905982906, + "grad_norm": 1.1687662601470947, + "learning_rate": 1.200854700854701e-06, + "loss": 0.0368, + "step": 2059 + }, + { + "epoch": 17.60683760683761, + "grad_norm": 2.5085737705230713, + "learning_rate": 1.1965811965811968e-06, + "loss": 0.1567, + "step": 2060 + }, + { + "epoch": 17.615384615384617, + "grad_norm": 0.43566644191741943, + "learning_rate": 1.1923076923076925e-06, + "loss": 0.0097, + "step": 2061 + }, + { + "epoch": 17.623931623931625, + "grad_norm": 0.7698078155517578, + "learning_rate": 1.188034188034188e-06, + "loss": 0.0231, + "step": 2062 + }, + { + "epoch": 17.632478632478634, + "grad_norm": 1.8352185487747192, + "learning_rate": 1.1837606837606839e-06, + "loss": 0.0324, + "step": 2063 + }, + { + "epoch": 17.641025641025642, + "grad_norm": 12.11907958984375, + "learning_rate": 1.1794871794871795e-06, + "loss": 0.6052, + "step": 2064 + }, + { + "epoch": 17.64957264957265, + "grad_norm": 0.49942728877067566, + "learning_rate": 1.1752136752136752e-06, + "loss": 0.0111, + "step": 2065 + }, + { + "epoch": 17.65811965811966, + "grad_norm": 3.579129457473755, + "learning_rate": 1.1709401709401711e-06, + "loss": 0.1706, + "step": 2066 + }, + { + "epoch": 17.666666666666668, + "grad_norm": 2.112550973892212, + "learning_rate": 1.1666666666666668e-06, + "loss": 0.0438, + "step": 2067 + }, + { + "epoch": 17.675213675213676, + "grad_norm": 2.4429895877838135, + "learning_rate": 1.1623931623931625e-06, + "loss": 0.0498, + "step": 2068 + }, + { + "epoch": 17.683760683760685, + "grad_norm": 1.8436684608459473, + "learning_rate": 1.1581196581196584e-06, + "loss": 0.1228, + "step": 2069 + }, + { + "epoch": 17.692307692307693, + "grad_norm": 4.679569244384766, + "learning_rate": 1.153846153846154e-06, + "loss": 0.1505, + "step": 2070 + }, + { + "epoch": 17.700854700854702, + "grad_norm": 2.4409713745117188, + "learning_rate": 1.1495726495726495e-06, + "loss": 0.0603, + "step": 2071 + }, + { + "epoch": 17.70940170940171, + "grad_norm": 3.577721118927002, + "learning_rate": 1.1452991452991454e-06, + "loss": 0.1078, + "step": 2072 + }, + { + "epoch": 17.71794871794872, + "grad_norm": 3.774958372116089, + "learning_rate": 1.141025641025641e-06, + "loss": 0.3782, + "step": 2073 + }, + { + "epoch": 17.726495726495727, + "grad_norm": 2.9011383056640625, + "learning_rate": 1.1367521367521368e-06, + "loss": 0.0714, + "step": 2074 + }, + { + "epoch": 17.735042735042736, + "grad_norm": 1.7296162843704224, + "learning_rate": 1.1324786324786326e-06, + "loss": 0.0463, + "step": 2075 + }, + { + "epoch": 17.743589743589745, + "grad_norm": 1.8955838680267334, + "learning_rate": 1.1282051282051283e-06, + "loss": 0.0641, + "step": 2076 + }, + { + "epoch": 17.752136752136753, + "grad_norm": 3.0198490619659424, + "learning_rate": 1.123931623931624e-06, + "loss": 0.1516, + "step": 2077 + }, + { + "epoch": 17.76068376068376, + "grad_norm": 1.5012823343276978, + "learning_rate": 1.1196581196581199e-06, + "loss": 0.0206, + "step": 2078 + }, + { + "epoch": 17.76923076923077, + "grad_norm": 2.4390790462493896, + "learning_rate": 1.1153846153846156e-06, + "loss": 0.0458, + "step": 2079 + }, + { + "epoch": 17.77777777777778, + "grad_norm": 5.728135585784912, + "learning_rate": 1.111111111111111e-06, + "loss": 0.0443, + "step": 2080 + }, + { + "epoch": 17.786324786324787, + "grad_norm": 1.423771858215332, + "learning_rate": 1.106837606837607e-06, + "loss": 0.0223, + "step": 2081 + }, + { + "epoch": 17.794871794871796, + "grad_norm": 2.524941921234131, + "learning_rate": 1.1025641025641026e-06, + "loss": 0.0587, + "step": 2082 + }, + { + "epoch": 17.803418803418804, + "grad_norm": 0.9632331132888794, + "learning_rate": 1.0982905982905983e-06, + "loss": 0.0324, + "step": 2083 + }, + { + "epoch": 17.811965811965813, + "grad_norm": 1.8369181156158447, + "learning_rate": 1.0940170940170942e-06, + "loss": 0.0182, + "step": 2084 + }, + { + "epoch": 17.82051282051282, + "grad_norm": 2.547654867172241, + "learning_rate": 1.0897435897435899e-06, + "loss": 0.1395, + "step": 2085 + }, + { + "epoch": 17.82905982905983, + "grad_norm": 3.516977310180664, + "learning_rate": 1.0854700854700855e-06, + "loss": 0.1044, + "step": 2086 + }, + { + "epoch": 17.837606837606838, + "grad_norm": 1.7064217329025269, + "learning_rate": 1.0811965811965814e-06, + "loss": 0.0302, + "step": 2087 + }, + { + "epoch": 17.846153846153847, + "grad_norm": 1.7427505254745483, + "learning_rate": 1.076923076923077e-06, + "loss": 0.0298, + "step": 2088 + }, + { + "epoch": 17.854700854700855, + "grad_norm": 1.3395370244979858, + "learning_rate": 1.0726495726495726e-06, + "loss": 0.0302, + "step": 2089 + }, + { + "epoch": 17.863247863247864, + "grad_norm": 7.244344711303711, + "learning_rate": 1.0683760683760685e-06, + "loss": 0.1925, + "step": 2090 + }, + { + "epoch": 17.871794871794872, + "grad_norm": 5.942878723144531, + "learning_rate": 1.0641025641025641e-06, + "loss": 0.489, + "step": 2091 + }, + { + "epoch": 17.88034188034188, + "grad_norm": 3.244260787963867, + "learning_rate": 1.0598290598290598e-06, + "loss": 0.2538, + "step": 2092 + }, + { + "epoch": 17.88888888888889, + "grad_norm": 0.9833334684371948, + "learning_rate": 1.0555555555555557e-06, + "loss": 0.0215, + "step": 2093 + }, + { + "epoch": 17.897435897435898, + "grad_norm": 3.0194849967956543, + "learning_rate": 1.0512820512820514e-06, + "loss": 0.07, + "step": 2094 + }, + { + "epoch": 17.905982905982906, + "grad_norm": 0.48535388708114624, + "learning_rate": 1.047008547008547e-06, + "loss": 0.0113, + "step": 2095 + }, + { + "epoch": 17.914529914529915, + "grad_norm": 4.334452152252197, + "learning_rate": 1.042735042735043e-06, + "loss": 0.127, + "step": 2096 + }, + { + "epoch": 17.923076923076923, + "grad_norm": 3.54429030418396, + "learning_rate": 1.0384615384615386e-06, + "loss": 0.0704, + "step": 2097 + }, + { + "epoch": 17.931623931623932, + "grad_norm": 1.1745219230651855, + "learning_rate": 1.034188034188034e-06, + "loss": 0.0418, + "step": 2098 + }, + { + "epoch": 17.94017094017094, + "grad_norm": 5.157544136047363, + "learning_rate": 1.02991452991453e-06, + "loss": 0.2562, + "step": 2099 + }, + { + "epoch": 17.94871794871795, + "grad_norm": 4.454767227172852, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.1141, + "step": 2100 + }, + { + "epoch": 17.957264957264957, + "grad_norm": 12.859573364257812, + "learning_rate": 1.0213675213675213e-06, + "loss": 0.3516, + "step": 2101 + }, + { + "epoch": 17.965811965811966, + "grad_norm": 5.780513763427734, + "learning_rate": 1.0170940170940172e-06, + "loss": 0.1663, + "step": 2102 + }, + { + "epoch": 17.974358974358974, + "grad_norm": 2.762153387069702, + "learning_rate": 1.012820512820513e-06, + "loss": 0.19, + "step": 2103 + }, + { + "epoch": 17.982905982905983, + "grad_norm": 5.649252891540527, + "learning_rate": 1.0085470085470086e-06, + "loss": 0.1736, + "step": 2104 + }, + { + "epoch": 17.99145299145299, + "grad_norm": 5.10836124420166, + "learning_rate": 1.0042735042735045e-06, + "loss": 0.1739, + "step": 2105 + }, + { + "epoch": 18.0, + "grad_norm": 6.474237442016602, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.3239, + "step": 2106 + }, + { + "epoch": 18.0, + "eval_loss": 0.052614517509937286, + "eval_runtime": 9.28, + "eval_samples_per_second": 50.216, + "eval_steps_per_second": 6.358, + "step": 2106 + }, + { + "epoch": 18.00854700854701, + "grad_norm": 0.8820164203643799, + "learning_rate": 9.957264957264958e-07, + "loss": 0.0237, + "step": 2107 + }, + { + "epoch": 18.017094017094017, + "grad_norm": 2.692166566848755, + "learning_rate": 9.914529914529915e-07, + "loss": 0.0962, + "step": 2108 + }, + { + "epoch": 18.025641025641026, + "grad_norm": 0.8048399090766907, + "learning_rate": 9.871794871794872e-07, + "loss": 0.0232, + "step": 2109 + }, + { + "epoch": 18.034188034188034, + "grad_norm": 4.4439826011657715, + "learning_rate": 9.829059829059829e-07, + "loss": 0.064, + "step": 2110 + }, + { + "epoch": 18.042735042735043, + "grad_norm": 1.62433660030365, + "learning_rate": 9.786324786324788e-07, + "loss": 0.1263, + "step": 2111 + }, + { + "epoch": 18.05128205128205, + "grad_norm": 4.766104221343994, + "learning_rate": 9.743589743589745e-07, + "loss": 0.2108, + "step": 2112 + }, + { + "epoch": 18.05982905982906, + "grad_norm": 139.34445190429688, + "learning_rate": 9.700854700854701e-07, + "loss": 0.237, + "step": 2113 + }, + { + "epoch": 18.068376068376068, + "grad_norm": 0.6069220900535583, + "learning_rate": 9.65811965811966e-07, + "loss": 0.0135, + "step": 2114 + }, + { + "epoch": 18.076923076923077, + "grad_norm": 2.7833995819091797, + "learning_rate": 9.615384615384617e-07, + "loss": 0.1677, + "step": 2115 + }, + { + "epoch": 18.085470085470085, + "grad_norm": 4.570268630981445, + "learning_rate": 9.572649572649574e-07, + "loss": 0.2304, + "step": 2116 + }, + { + "epoch": 18.094017094017094, + "grad_norm": 4.7644805908203125, + "learning_rate": 9.529914529914531e-07, + "loss": 0.138, + "step": 2117 + }, + { + "epoch": 18.102564102564102, + "grad_norm": 1.9438762664794922, + "learning_rate": 9.487179487179487e-07, + "loss": 0.0488, + "step": 2118 + }, + { + "epoch": 18.11111111111111, + "grad_norm": 1.4188040494918823, + "learning_rate": 9.444444444444445e-07, + "loss": 0.0545, + "step": 2119 + }, + { + "epoch": 18.11965811965812, + "grad_norm": 0.357928603887558, + "learning_rate": 9.401709401709402e-07, + "loss": 0.0092, + "step": 2120 + }, + { + "epoch": 18.128205128205128, + "grad_norm": 1.8646256923675537, + "learning_rate": 9.35897435897436e-07, + "loss": 0.086, + "step": 2121 + }, + { + "epoch": 18.136752136752136, + "grad_norm": 2.111544609069824, + "learning_rate": 9.316239316239318e-07, + "loss": 0.0319, + "step": 2122 + }, + { + "epoch": 18.145299145299145, + "grad_norm": 3.0686893463134766, + "learning_rate": 9.273504273504274e-07, + "loss": 0.0689, + "step": 2123 + }, + { + "epoch": 18.153846153846153, + "grad_norm": 4.028079509735107, + "learning_rate": 9.230769230769232e-07, + "loss": 0.125, + "step": 2124 + }, + { + "epoch": 18.162393162393162, + "grad_norm": 1.0433181524276733, + "learning_rate": 9.188034188034189e-07, + "loss": 0.0174, + "step": 2125 + }, + { + "epoch": 18.17094017094017, + "grad_norm": 3.4533402919769287, + "learning_rate": 9.145299145299146e-07, + "loss": 0.1556, + "step": 2126 + }, + { + "epoch": 18.17948717948718, + "grad_norm": 11.187241554260254, + "learning_rate": 9.102564102564103e-07, + "loss": 0.2578, + "step": 2127 + }, + { + "epoch": 18.188034188034187, + "grad_norm": 2.544975757598877, + "learning_rate": 9.05982905982906e-07, + "loss": 0.0868, + "step": 2128 + }, + { + "epoch": 18.196581196581196, + "grad_norm": 2.490493059158325, + "learning_rate": 9.017094017094017e-07, + "loss": 0.1575, + "step": 2129 + }, + { + "epoch": 18.205128205128204, + "grad_norm": 4.665895938873291, + "learning_rate": 8.974358974358975e-07, + "loss": 0.1644, + "step": 2130 + }, + { + "epoch": 18.213675213675213, + "grad_norm": 3.135772943496704, + "learning_rate": 8.931623931623933e-07, + "loss": 0.205, + "step": 2131 + }, + { + "epoch": 18.22222222222222, + "grad_norm": 1.5636606216430664, + "learning_rate": 8.88888888888889e-07, + "loss": 0.0541, + "step": 2132 + }, + { + "epoch": 18.23076923076923, + "grad_norm": 3.603691816329956, + "learning_rate": 8.846153846153848e-07, + "loss": 0.0478, + "step": 2133 + }, + { + "epoch": 18.23931623931624, + "grad_norm": 2.6537222862243652, + "learning_rate": 8.803418803418804e-07, + "loss": 0.1206, + "step": 2134 + }, + { + "epoch": 18.247863247863247, + "grad_norm": 5.086421966552734, + "learning_rate": 8.760683760683761e-07, + "loss": 0.1212, + "step": 2135 + }, + { + "epoch": 18.256410256410255, + "grad_norm": 4.673394203186035, + "learning_rate": 8.717948717948718e-07, + "loss": 0.0588, + "step": 2136 + }, + { + "epoch": 18.264957264957264, + "grad_norm": 2.1376845836639404, + "learning_rate": 8.675213675213676e-07, + "loss": 0.0492, + "step": 2137 + }, + { + "epoch": 18.273504273504273, + "grad_norm": 2.8616504669189453, + "learning_rate": 8.632478632478633e-07, + "loss": 0.1834, + "step": 2138 + }, + { + "epoch": 18.28205128205128, + "grad_norm": 2.7179784774780273, + "learning_rate": 8.58974358974359e-07, + "loss": 0.1508, + "step": 2139 + }, + { + "epoch": 18.29059829059829, + "grad_norm": 1.1909416913986206, + "learning_rate": 8.547008547008548e-07, + "loss": 0.0721, + "step": 2140 + }, + { + "epoch": 18.299145299145298, + "grad_norm": 1.8272216320037842, + "learning_rate": 8.504273504273505e-07, + "loss": 0.0797, + "step": 2141 + }, + { + "epoch": 18.307692307692307, + "grad_norm": 4.394528388977051, + "learning_rate": 8.461538461538463e-07, + "loss": 0.2762, + "step": 2142 + }, + { + "epoch": 18.316239316239315, + "grad_norm": 4.276169776916504, + "learning_rate": 8.41880341880342e-07, + "loss": 0.0969, + "step": 2143 + }, + { + "epoch": 18.324786324786324, + "grad_norm": 2.0932376384735107, + "learning_rate": 8.376068376068377e-07, + "loss": 0.0595, + "step": 2144 + }, + { + "epoch": 18.333333333333332, + "grad_norm": 5.714378833770752, + "learning_rate": 8.333333333333333e-07, + "loss": 0.1176, + "step": 2145 + }, + { + "epoch": 18.34188034188034, + "grad_norm": 1.1050394773483276, + "learning_rate": 8.290598290598291e-07, + "loss": 0.0284, + "step": 2146 + }, + { + "epoch": 18.35042735042735, + "grad_norm": 3.2809271812438965, + "learning_rate": 8.247863247863248e-07, + "loss": 0.0737, + "step": 2147 + }, + { + "epoch": 18.358974358974358, + "grad_norm": 2.102889060974121, + "learning_rate": 8.205128205128206e-07, + "loss": 0.0477, + "step": 2148 + }, + { + "epoch": 18.367521367521366, + "grad_norm": 1.5728402137756348, + "learning_rate": 8.162393162393164e-07, + "loss": 0.0476, + "step": 2149 + }, + { + "epoch": 18.376068376068375, + "grad_norm": 2.0337905883789062, + "learning_rate": 8.11965811965812e-07, + "loss": 0.019, + "step": 2150 + }, + { + "epoch": 18.384615384615383, + "grad_norm": 5.475340843200684, + "learning_rate": 8.076923076923078e-07, + "loss": 0.1625, + "step": 2151 + }, + { + "epoch": 18.39316239316239, + "grad_norm": 0.4993753135204315, + "learning_rate": 8.034188034188035e-07, + "loss": 0.0132, + "step": 2152 + }, + { + "epoch": 18.4017094017094, + "grad_norm": 4.052933216094971, + "learning_rate": 7.991452991452992e-07, + "loss": 0.1603, + "step": 2153 + }, + { + "epoch": 18.41025641025641, + "grad_norm": 3.005293607711792, + "learning_rate": 7.948717948717949e-07, + "loss": 0.0399, + "step": 2154 + }, + { + "epoch": 18.418803418803417, + "grad_norm": 3.0186731815338135, + "learning_rate": 7.905982905982906e-07, + "loss": 0.0564, + "step": 2155 + }, + { + "epoch": 18.427350427350426, + "grad_norm": 5.522226333618164, + "learning_rate": 7.863247863247863e-07, + "loss": 0.1138, + "step": 2156 + }, + { + "epoch": 18.435897435897434, + "grad_norm": 5.463916301727295, + "learning_rate": 7.820512820512821e-07, + "loss": 0.4811, + "step": 2157 + }, + { + "epoch": 18.444444444444443, + "grad_norm": 0.41404595971107483, + "learning_rate": 7.777777777777779e-07, + "loss": 0.0114, + "step": 2158 + }, + { + "epoch": 18.45299145299145, + "grad_norm": 0.9279537200927734, + "learning_rate": 7.735042735042736e-07, + "loss": 0.0268, + "step": 2159 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 0.5745738744735718, + "learning_rate": 7.692307692307694e-07, + "loss": 0.0155, + "step": 2160 + }, + { + "epoch": 18.47008547008547, + "grad_norm": 2.329507827758789, + "learning_rate": 7.64957264957265e-07, + "loss": 0.0421, + "step": 2161 + }, + { + "epoch": 18.478632478632477, + "grad_norm": 2.934424638748169, + "learning_rate": 7.606837606837607e-07, + "loss": 0.0925, + "step": 2162 + }, + { + "epoch": 18.487179487179485, + "grad_norm": 3.226261854171753, + "learning_rate": 7.564102564102564e-07, + "loss": 0.1914, + "step": 2163 + }, + { + "epoch": 18.495726495726494, + "grad_norm": 1.2033684253692627, + "learning_rate": 7.521367521367522e-07, + "loss": 0.0218, + "step": 2164 + }, + { + "epoch": 18.504273504273506, + "grad_norm": 1.092015266418457, + "learning_rate": 7.478632478632479e-07, + "loss": 0.0165, + "step": 2165 + }, + { + "epoch": 18.51282051282051, + "grad_norm": 1.2283809185028076, + "learning_rate": 7.435897435897436e-07, + "loss": 0.025, + "step": 2166 + }, + { + "epoch": 18.521367521367523, + "grad_norm": 6.3457722663879395, + "learning_rate": 7.393162393162394e-07, + "loss": 0.2224, + "step": 2167 + }, + { + "epoch": 18.52991452991453, + "grad_norm": 4.920536518096924, + "learning_rate": 7.350427350427351e-07, + "loss": 0.1381, + "step": 2168 + }, + { + "epoch": 18.53846153846154, + "grad_norm": 4.16088342666626, + "learning_rate": 7.307692307692309e-07, + "loss": 0.2725, + "step": 2169 + }, + { + "epoch": 18.54700854700855, + "grad_norm": 1.4776932001113892, + "learning_rate": 7.264957264957266e-07, + "loss": 0.0236, + "step": 2170 + }, + { + "epoch": 18.555555555555557, + "grad_norm": 5.517492294311523, + "learning_rate": 7.222222222222222e-07, + "loss": 0.3427, + "step": 2171 + }, + { + "epoch": 18.564102564102566, + "grad_norm": 0.7798398733139038, + "learning_rate": 7.179487179487179e-07, + "loss": 0.0139, + "step": 2172 + }, + { + "epoch": 18.572649572649574, + "grad_norm": 0.7174245119094849, + "learning_rate": 7.136752136752137e-07, + "loss": 0.0144, + "step": 2173 + }, + { + "epoch": 18.581196581196583, + "grad_norm": 5.118779182434082, + "learning_rate": 7.094017094017094e-07, + "loss": 0.1899, + "step": 2174 + }, + { + "epoch": 18.58974358974359, + "grad_norm": 2.8726353645324707, + "learning_rate": 7.051282051282052e-07, + "loss": 0.1177, + "step": 2175 + }, + { + "epoch": 18.5982905982906, + "grad_norm": 2.3775036334991455, + "learning_rate": 7.00854700854701e-07, + "loss": 0.1183, + "step": 2176 + }, + { + "epoch": 18.60683760683761, + "grad_norm": 19.23975944519043, + "learning_rate": 6.965811965811966e-07, + "loss": 0.4534, + "step": 2177 + }, + { + "epoch": 18.615384615384617, + "grad_norm": 1.3832803964614868, + "learning_rate": 6.923076923076924e-07, + "loss": 0.0309, + "step": 2178 + }, + { + "epoch": 18.623931623931625, + "grad_norm": 1.6752214431762695, + "learning_rate": 6.880341880341881e-07, + "loss": 0.0201, + "step": 2179 + }, + { + "epoch": 18.632478632478634, + "grad_norm": 3.1885950565338135, + "learning_rate": 6.837606837606839e-07, + "loss": 0.1242, + "step": 2180 + }, + { + "epoch": 18.641025641025642, + "grad_norm": 0.9290790557861328, + "learning_rate": 6.794871794871795e-07, + "loss": 0.0189, + "step": 2181 + }, + { + "epoch": 18.64957264957265, + "grad_norm": 0.25725051760673523, + "learning_rate": 6.752136752136752e-07, + "loss": 0.0065, + "step": 2182 + }, + { + "epoch": 18.65811965811966, + "grad_norm": 1.9815839529037476, + "learning_rate": 6.709401709401709e-07, + "loss": 0.0576, + "step": 2183 + }, + { + "epoch": 18.666666666666668, + "grad_norm": 1.924490213394165, + "learning_rate": 6.666666666666667e-07, + "loss": 0.0671, + "step": 2184 + }, + { + "epoch": 18.675213675213676, + "grad_norm": 2.9947164058685303, + "learning_rate": 6.623931623931625e-07, + "loss": 0.1859, + "step": 2185 + }, + { + "epoch": 18.683760683760685, + "grad_norm": 1.8680211305618286, + "learning_rate": 6.581196581196582e-07, + "loss": 0.1028, + "step": 2186 + }, + { + "epoch": 18.692307692307693, + "grad_norm": 0.823103666305542, + "learning_rate": 6.53846153846154e-07, + "loss": 0.0198, + "step": 2187 + }, + { + "epoch": 18.700854700854702, + "grad_norm": 2.3616061210632324, + "learning_rate": 6.495726495726496e-07, + "loss": 0.1025, + "step": 2188 + }, + { + "epoch": 18.70940170940171, + "grad_norm": 3.1370067596435547, + "learning_rate": 6.452991452991454e-07, + "loss": 0.0438, + "step": 2189 + }, + { + "epoch": 18.71794871794872, + "grad_norm": 8.058025360107422, + "learning_rate": 6.41025641025641e-07, + "loss": 0.082, + "step": 2190 + }, + { + "epoch": 18.726495726495727, + "grad_norm": 2.1969916820526123, + "learning_rate": 6.367521367521368e-07, + "loss": 0.1074, + "step": 2191 + }, + { + "epoch": 18.735042735042736, + "grad_norm": 2.5845255851745605, + "learning_rate": 6.324786324786325e-07, + "loss": 0.0795, + "step": 2192 + }, + { + "epoch": 18.743589743589745, + "grad_norm": 3.578331708908081, + "learning_rate": 6.282051282051282e-07, + "loss": 0.1111, + "step": 2193 + }, + { + "epoch": 18.752136752136753, + "grad_norm": 1.5390626192092896, + "learning_rate": 6.23931623931624e-07, + "loss": 0.064, + "step": 2194 + }, + { + "epoch": 18.76068376068376, + "grad_norm": 3.1742804050445557, + "learning_rate": 6.196581196581197e-07, + "loss": 0.0971, + "step": 2195 + }, + { + "epoch": 18.76923076923077, + "grad_norm": 1.7017542123794556, + "learning_rate": 6.153846153846155e-07, + "loss": 0.0424, + "step": 2196 + }, + { + "epoch": 18.77777777777778, + "grad_norm": 2.642102003097534, + "learning_rate": 6.111111111111112e-07, + "loss": 0.1243, + "step": 2197 + }, + { + "epoch": 18.786324786324787, + "grad_norm": 1.2010291814804077, + "learning_rate": 6.068376068376068e-07, + "loss": 0.0375, + "step": 2198 + }, + { + "epoch": 18.794871794871796, + "grad_norm": 3.1580190658569336, + "learning_rate": 6.025641025641026e-07, + "loss": 0.0565, + "step": 2199 + }, + { + "epoch": 18.803418803418804, + "grad_norm": 2.7660391330718994, + "learning_rate": 5.982905982905984e-07, + "loss": 0.0385, + "step": 2200 + }, + { + "epoch": 18.811965811965813, + "grad_norm": 0.7716617584228516, + "learning_rate": 5.94017094017094e-07, + "loss": 0.0159, + "step": 2201 + }, + { + "epoch": 18.82051282051282, + "grad_norm": 3.190251588821411, + "learning_rate": 5.897435897435898e-07, + "loss": 0.241, + "step": 2202 + }, + { + "epoch": 18.82905982905983, + "grad_norm": 7.115220069885254, + "learning_rate": 5.854700854700856e-07, + "loss": 0.1777, + "step": 2203 + }, + { + "epoch": 18.837606837606838, + "grad_norm": 5.071573257446289, + "learning_rate": 5.811965811965812e-07, + "loss": 0.5421, + "step": 2204 + }, + { + "epoch": 18.846153846153847, + "grad_norm": 3.8419785499572754, + "learning_rate": 5.76923076923077e-07, + "loss": 0.0784, + "step": 2205 + }, + { + "epoch": 18.854700854700855, + "grad_norm": 2.8234896659851074, + "learning_rate": 5.726495726495727e-07, + "loss": 0.1071, + "step": 2206 + }, + { + "epoch": 18.863247863247864, + "grad_norm": 1.4067480564117432, + "learning_rate": 5.683760683760684e-07, + "loss": 0.0375, + "step": 2207 + }, + { + "epoch": 18.871794871794872, + "grad_norm": 2.508589029312134, + "learning_rate": 5.641025641025642e-07, + "loss": 0.0921, + "step": 2208 + }, + { + "epoch": 18.88034188034188, + "grad_norm": 7.314038276672363, + "learning_rate": 5.598290598290599e-07, + "loss": 0.3581, + "step": 2209 + }, + { + "epoch": 18.88888888888889, + "grad_norm": 4.375041961669922, + "learning_rate": 5.555555555555555e-07, + "loss": 0.1115, + "step": 2210 + }, + { + "epoch": 18.897435897435898, + "grad_norm": 4.789741516113281, + "learning_rate": 5.512820512820513e-07, + "loss": 0.1813, + "step": 2211 + }, + { + "epoch": 18.905982905982906, + "grad_norm": 3.008720874786377, + "learning_rate": 5.470085470085471e-07, + "loss": 0.104, + "step": 2212 + }, + { + "epoch": 18.914529914529915, + "grad_norm": 0.6364433765411377, + "learning_rate": 5.427350427350428e-07, + "loss": 0.0153, + "step": 2213 + }, + { + "epoch": 18.923076923076923, + "grad_norm": 1.4009958505630493, + "learning_rate": 5.384615384615386e-07, + "loss": 0.0499, + "step": 2214 + }, + { + "epoch": 18.931623931623932, + "grad_norm": 4.53135347366333, + "learning_rate": 5.341880341880342e-07, + "loss": 0.1021, + "step": 2215 + }, + { + "epoch": 18.94017094017094, + "grad_norm": 0.7855163216590881, + "learning_rate": 5.299145299145299e-07, + "loss": 0.0297, + "step": 2216 + }, + { + "epoch": 18.94871794871795, + "grad_norm": 1.5316343307495117, + "learning_rate": 5.256410256410257e-07, + "loss": 0.0438, + "step": 2217 + }, + { + "epoch": 18.957264957264957, + "grad_norm": 1.2713849544525146, + "learning_rate": 5.213675213675215e-07, + "loss": 0.0311, + "step": 2218 + }, + { + "epoch": 18.965811965811966, + "grad_norm": 1.612418293952942, + "learning_rate": 5.17094017094017e-07, + "loss": 0.0796, + "step": 2219 + }, + { + "epoch": 18.974358974358974, + "grad_norm": 6.046596527099609, + "learning_rate": 5.128205128205128e-07, + "loss": 0.0835, + "step": 2220 + }, + { + "epoch": 18.982905982905983, + "grad_norm": 2.527993679046631, + "learning_rate": 5.085470085470086e-07, + "loss": 0.0448, + "step": 2221 + }, + { + "epoch": 18.99145299145299, + "grad_norm": 0.9519897699356079, + "learning_rate": 5.042735042735043e-07, + "loss": 0.0223, + "step": 2222 + }, + { + "epoch": 19.0, + "grad_norm": 14.08708667755127, + "learning_rate": 5.000000000000001e-07, + "loss": 0.6753, + "step": 2223 + }, + { + "epoch": 19.0, + "eval_loss": 0.05170569196343422, + "eval_runtime": 9.3972, + "eval_samples_per_second": 49.589, + "eval_steps_per_second": 6.278, + "step": 2223 + }, + { + "epoch": 19.00854700854701, + "grad_norm": 5.215019702911377, + "learning_rate": 4.957264957264958e-07, + "loss": 0.1614, + "step": 2224 + }, + { + "epoch": 19.017094017094017, + "grad_norm": 2.855567216873169, + "learning_rate": 4.914529914529914e-07, + "loss": 0.1051, + "step": 2225 + }, + { + "epoch": 19.025641025641026, + "grad_norm": 4.078762054443359, + "learning_rate": 4.871794871794872e-07, + "loss": 0.2859, + "step": 2226 + }, + { + "epoch": 19.034188034188034, + "grad_norm": 0.9259152412414551, + "learning_rate": 4.82905982905983e-07, + "loss": 0.0257, + "step": 2227 + }, + { + "epoch": 19.042735042735043, + "grad_norm": 3.629925012588501, + "learning_rate": 4.786324786324787e-07, + "loss": 0.1283, + "step": 2228 + }, + { + "epoch": 19.05128205128205, + "grad_norm": 3.104196310043335, + "learning_rate": 4.7435897435897437e-07, + "loss": 0.0701, + "step": 2229 + }, + { + "epoch": 19.05982905982906, + "grad_norm": 8.760592460632324, + "learning_rate": 4.700854700854701e-07, + "loss": 0.5793, + "step": 2230 + }, + { + "epoch": 19.068376068376068, + "grad_norm": 1.2966917753219604, + "learning_rate": 4.658119658119659e-07, + "loss": 0.0573, + "step": 2231 + }, + { + "epoch": 19.076923076923077, + "grad_norm": 1.7045038938522339, + "learning_rate": 4.615384615384616e-07, + "loss": 0.0497, + "step": 2232 + }, + { + "epoch": 19.085470085470085, + "grad_norm": 7.805142402648926, + "learning_rate": 4.572649572649573e-07, + "loss": 0.2898, + "step": 2233 + }, + { + "epoch": 19.094017094017094, + "grad_norm": 0.5019100308418274, + "learning_rate": 4.52991452991453e-07, + "loss": 0.0132, + "step": 2234 + }, + { + "epoch": 19.102564102564102, + "grad_norm": 3.1100540161132812, + "learning_rate": 4.4871794871794876e-07, + "loss": 0.0874, + "step": 2235 + }, + { + "epoch": 19.11111111111111, + "grad_norm": 0.40422680974006653, + "learning_rate": 4.444444444444445e-07, + "loss": 0.012, + "step": 2236 + }, + { + "epoch": 19.11965811965812, + "grad_norm": 1.2845938205718994, + "learning_rate": 4.401709401709402e-07, + "loss": 0.0259, + "step": 2237 + }, + { + "epoch": 19.128205128205128, + "grad_norm": 4.621537208557129, + "learning_rate": 4.358974358974359e-07, + "loss": 0.246, + "step": 2238 + }, + { + "epoch": 19.136752136752136, + "grad_norm": 1.1688278913497925, + "learning_rate": 4.3162393162393163e-07, + "loss": 0.0804, + "step": 2239 + }, + { + "epoch": 19.145299145299145, + "grad_norm": 10.896872520446777, + "learning_rate": 4.273504273504274e-07, + "loss": 0.2695, + "step": 2240 + }, + { + "epoch": 19.153846153846153, + "grad_norm": 2.7485415935516357, + "learning_rate": 4.2307692307692315e-07, + "loss": 0.0474, + "step": 2241 + }, + { + "epoch": 19.162393162393162, + "grad_norm": 1.1686739921569824, + "learning_rate": 4.188034188034188e-07, + "loss": 0.0257, + "step": 2242 + }, + { + "epoch": 19.17094017094017, + "grad_norm": 3.5579254627227783, + "learning_rate": 4.1452991452991456e-07, + "loss": 0.0419, + "step": 2243 + }, + { + "epoch": 19.17948717948718, + "grad_norm": 3.088649034500122, + "learning_rate": 4.102564102564103e-07, + "loss": 0.1229, + "step": 2244 + }, + { + "epoch": 19.188034188034187, + "grad_norm": 1.4894665479660034, + "learning_rate": 4.05982905982906e-07, + "loss": 0.0414, + "step": 2245 + }, + { + "epoch": 19.196581196581196, + "grad_norm": 5.022091865539551, + "learning_rate": 4.0170940170940175e-07, + "loss": 0.1423, + "step": 2246 + }, + { + "epoch": 19.205128205128204, + "grad_norm": 1.6117054224014282, + "learning_rate": 3.9743589743589743e-07, + "loss": 0.0244, + "step": 2247 + }, + { + "epoch": 19.213675213675213, + "grad_norm": 0.5429085493087769, + "learning_rate": 3.9316239316239316e-07, + "loss": 0.0122, + "step": 2248 + }, + { + "epoch": 19.22222222222222, + "grad_norm": 7.429282188415527, + "learning_rate": 3.8888888888888895e-07, + "loss": 0.122, + "step": 2249 + }, + { + "epoch": 19.23076923076923, + "grad_norm": 4.492022514343262, + "learning_rate": 3.846153846153847e-07, + "loss": 0.3181, + "step": 2250 + }, + { + "epoch": 19.23931623931624, + "grad_norm": 5.219499588012695, + "learning_rate": 3.8034188034188036e-07, + "loss": 0.1374, + "step": 2251 + }, + { + "epoch": 19.247863247863247, + "grad_norm": 3.454345941543579, + "learning_rate": 3.760683760683761e-07, + "loss": 0.147, + "step": 2252 + }, + { + "epoch": 19.256410256410255, + "grad_norm": 0.6370477080345154, + "learning_rate": 3.717948717948718e-07, + "loss": 0.0154, + "step": 2253 + }, + { + "epoch": 19.264957264957264, + "grad_norm": 1.7189971208572388, + "learning_rate": 3.6752136752136755e-07, + "loss": 0.0635, + "step": 2254 + }, + { + "epoch": 19.273504273504273, + "grad_norm": 2.716744899749756, + "learning_rate": 3.632478632478633e-07, + "loss": 0.0966, + "step": 2255 + }, + { + "epoch": 19.28205128205128, + "grad_norm": 2.4959864616394043, + "learning_rate": 3.5897435897435896e-07, + "loss": 0.0779, + "step": 2256 + }, + { + "epoch": 19.29059829059829, + "grad_norm": 3.625793218612671, + "learning_rate": 3.547008547008547e-07, + "loss": 0.3238, + "step": 2257 + }, + { + "epoch": 19.299145299145298, + "grad_norm": 1.8783844709396362, + "learning_rate": 3.504273504273505e-07, + "loss": 0.0319, + "step": 2258 + }, + { + "epoch": 19.307692307692307, + "grad_norm": 1.6740922927856445, + "learning_rate": 3.461538461538462e-07, + "loss": 0.0844, + "step": 2259 + }, + { + "epoch": 19.316239316239315, + "grad_norm": 2.8891098499298096, + "learning_rate": 3.4188034188034194e-07, + "loss": 0.1916, + "step": 2260 + }, + { + "epoch": 19.324786324786324, + "grad_norm": 0.9975456595420837, + "learning_rate": 3.376068376068376e-07, + "loss": 0.0266, + "step": 2261 + }, + { + "epoch": 19.333333333333332, + "grad_norm": 2.576789379119873, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.0722, + "step": 2262 + }, + { + "epoch": 19.34188034188034, + "grad_norm": 9.070858001708984, + "learning_rate": 3.290598290598291e-07, + "loss": 0.2998, + "step": 2263 + }, + { + "epoch": 19.35042735042735, + "grad_norm": 3.052319049835205, + "learning_rate": 3.247863247863248e-07, + "loss": 0.0435, + "step": 2264 + }, + { + "epoch": 19.358974358974358, + "grad_norm": 0.8035821318626404, + "learning_rate": 3.205128205128205e-07, + "loss": 0.0233, + "step": 2265 + }, + { + "epoch": 19.367521367521366, + "grad_norm": 3.7658371925354004, + "learning_rate": 3.1623931623931623e-07, + "loss": 0.3007, + "step": 2266 + }, + { + "epoch": 19.376068376068375, + "grad_norm": 1.210494875907898, + "learning_rate": 3.11965811965812e-07, + "loss": 0.0344, + "step": 2267 + }, + { + "epoch": 19.384615384615383, + "grad_norm": 1.1121772527694702, + "learning_rate": 3.0769230769230774e-07, + "loss": 0.054, + "step": 2268 + }, + { + "epoch": 19.39316239316239, + "grad_norm": 2.842228412628174, + "learning_rate": 3.034188034188034e-07, + "loss": 0.0814, + "step": 2269 + }, + { + "epoch": 19.4017094017094, + "grad_norm": 1.9269556999206543, + "learning_rate": 2.991452991452992e-07, + "loss": 0.0354, + "step": 2270 + }, + { + "epoch": 19.41025641025641, + "grad_norm": 7.359715938568115, + "learning_rate": 2.948717948717949e-07, + "loss": 0.3288, + "step": 2271 + }, + { + "epoch": 19.418803418803417, + "grad_norm": 1.7621564865112305, + "learning_rate": 2.905982905982906e-07, + "loss": 0.0313, + "step": 2272 + }, + { + "epoch": 19.427350427350426, + "grad_norm": 2.5410284996032715, + "learning_rate": 2.8632478632478635e-07, + "loss": 0.076, + "step": 2273 + }, + { + "epoch": 19.435897435897434, + "grad_norm": 5.633874416351318, + "learning_rate": 2.820512820512821e-07, + "loss": 0.1903, + "step": 2274 + }, + { + "epoch": 19.444444444444443, + "grad_norm": 1.935703158378601, + "learning_rate": 2.7777777777777776e-07, + "loss": 0.3778, + "step": 2275 + }, + { + "epoch": 19.45299145299145, + "grad_norm": 7.559366703033447, + "learning_rate": 2.7350427350427354e-07, + "loss": 0.2684, + "step": 2276 + }, + { + "epoch": 19.46153846153846, + "grad_norm": 9.240869522094727, + "learning_rate": 2.692307692307693e-07, + "loss": 0.2982, + "step": 2277 + }, + { + "epoch": 19.47008547008547, + "grad_norm": 6.940350532531738, + "learning_rate": 2.6495726495726495e-07, + "loss": 0.3131, + "step": 2278 + }, + { + "epoch": 19.478632478632477, + "grad_norm": 1.3201594352722168, + "learning_rate": 2.6068376068376074e-07, + "loss": 0.0191, + "step": 2279 + }, + { + "epoch": 19.487179487179485, + "grad_norm": 1.626806616783142, + "learning_rate": 2.564102564102564e-07, + "loss": 0.0361, + "step": 2280 + }, + { + "epoch": 19.495726495726494, + "grad_norm": 8.687582969665527, + "learning_rate": 2.5213675213675215e-07, + "loss": 0.1942, + "step": 2281 + }, + { + "epoch": 19.504273504273506, + "grad_norm": 5.104561805725098, + "learning_rate": 2.478632478632479e-07, + "loss": 0.1906, + "step": 2282 + }, + { + "epoch": 19.51282051282051, + "grad_norm": 2.8611207008361816, + "learning_rate": 2.435897435897436e-07, + "loss": 0.1258, + "step": 2283 + }, + { + "epoch": 19.521367521367523, + "grad_norm": 1.2258422374725342, + "learning_rate": 2.3931623931623934e-07, + "loss": 0.0186, + "step": 2284 + }, + { + "epoch": 19.52991452991453, + "grad_norm": 5.307450294494629, + "learning_rate": 2.3504273504273505e-07, + "loss": 0.1356, + "step": 2285 + }, + { + "epoch": 19.53846153846154, + "grad_norm": 2.0854647159576416, + "learning_rate": 2.307692307692308e-07, + "loss": 0.0533, + "step": 2286 + }, + { + "epoch": 19.54700854700855, + "grad_norm": 1.8560184240341187, + "learning_rate": 2.264957264957265e-07, + "loss": 0.048, + "step": 2287 + }, + { + "epoch": 19.555555555555557, + "grad_norm": 5.781933307647705, + "learning_rate": 2.2222222222222224e-07, + "loss": 0.2769, + "step": 2288 + }, + { + "epoch": 19.564102564102566, + "grad_norm": 4.858759880065918, + "learning_rate": 2.1794871794871795e-07, + "loss": 0.4217, + "step": 2289 + }, + { + "epoch": 19.572649572649574, + "grad_norm": 3.7598235607147217, + "learning_rate": 2.136752136752137e-07, + "loss": 0.162, + "step": 2290 + }, + { + "epoch": 19.581196581196583, + "grad_norm": 0.5706556439399719, + "learning_rate": 2.094017094017094e-07, + "loss": 0.0151, + "step": 2291 + }, + { + "epoch": 19.58974358974359, + "grad_norm": 5.697900295257568, + "learning_rate": 2.0512820512820514e-07, + "loss": 0.1015, + "step": 2292 + }, + { + "epoch": 19.5982905982906, + "grad_norm": 4.635442733764648, + "learning_rate": 2.0085470085470088e-07, + "loss": 0.1827, + "step": 2293 + }, + { + "epoch": 19.60683760683761, + "grad_norm": 3.070131778717041, + "learning_rate": 1.9658119658119658e-07, + "loss": 0.0802, + "step": 2294 + }, + { + "epoch": 19.615384615384617, + "grad_norm": 0.979217529296875, + "learning_rate": 1.9230769230769234e-07, + "loss": 0.0237, + "step": 2295 + }, + { + "epoch": 19.623931623931625, + "grad_norm": 5.640648365020752, + "learning_rate": 1.8803418803418804e-07, + "loss": 0.0588, + "step": 2296 + }, + { + "epoch": 19.632478632478634, + "grad_norm": 7.1512861251831055, + "learning_rate": 1.8376068376068378e-07, + "loss": 0.1942, + "step": 2297 + }, + { + "epoch": 19.641025641025642, + "grad_norm": 12.868803024291992, + "learning_rate": 1.7948717948717948e-07, + "loss": 0.2771, + "step": 2298 + }, + { + "epoch": 19.64957264957265, + "grad_norm": 2.954000234603882, + "learning_rate": 1.7521367521367524e-07, + "loss": 0.1124, + "step": 2299 + }, + { + "epoch": 19.65811965811966, + "grad_norm": 0.47206825017929077, + "learning_rate": 1.7094017094017097e-07, + "loss": 0.0104, + "step": 2300 + }, + { + "epoch": 19.666666666666668, + "grad_norm": 0.6243001818656921, + "learning_rate": 1.6666666666666668e-07, + "loss": 0.0145, + "step": 2301 + }, + { + "epoch": 19.675213675213676, + "grad_norm": 1.6680350303649902, + "learning_rate": 1.623931623931624e-07, + "loss": 0.0634, + "step": 2302 + }, + { + "epoch": 19.683760683760685, + "grad_norm": 6.298573017120361, + "learning_rate": 1.5811965811965811e-07, + "loss": 0.2083, + "step": 2303 + }, + { + "epoch": 19.692307692307693, + "grad_norm": 0.622466504573822, + "learning_rate": 1.5384615384615387e-07, + "loss": 0.0155, + "step": 2304 + }, + { + "epoch": 19.700854700854702, + "grad_norm": 2.289080858230591, + "learning_rate": 1.495726495726496e-07, + "loss": 0.0698, + "step": 2305 + }, + { + "epoch": 19.70940170940171, + "grad_norm": 13.065472602844238, + "learning_rate": 1.452991452991453e-07, + "loss": 0.2587, + "step": 2306 + }, + { + "epoch": 19.71794871794872, + "grad_norm": 0.903513491153717, + "learning_rate": 1.4102564102564104e-07, + "loss": 0.0222, + "step": 2307 + }, + { + "epoch": 19.726495726495727, + "grad_norm": 1.3763283491134644, + "learning_rate": 1.3675213675213677e-07, + "loss": 0.042, + "step": 2308 + }, + { + "epoch": 19.735042735042736, + "grad_norm": 3.3493802547454834, + "learning_rate": 1.3247863247863248e-07, + "loss": 0.1042, + "step": 2309 + }, + { + "epoch": 19.743589743589745, + "grad_norm": 12.862226486206055, + "learning_rate": 1.282051282051282e-07, + "loss": 0.359, + "step": 2310 + }, + { + "epoch": 19.752136752136753, + "grad_norm": 5.56069278717041, + "learning_rate": 1.2393162393162394e-07, + "loss": 0.1645, + "step": 2311 + }, + { + "epoch": 19.76068376068376, + "grad_norm": 2.900381326675415, + "learning_rate": 1.1965811965811967e-07, + "loss": 0.1641, + "step": 2312 + }, + { + "epoch": 19.76923076923077, + "grad_norm": 1.3674333095550537, + "learning_rate": 1.153846153846154e-07, + "loss": 0.0428, + "step": 2313 + }, + { + "epoch": 19.77777777777778, + "grad_norm": 2.06278657913208, + "learning_rate": 1.1111111111111112e-07, + "loss": 0.0404, + "step": 2314 + }, + { + "epoch": 19.786324786324787, + "grad_norm": 5.760499954223633, + "learning_rate": 1.0683760683760685e-07, + "loss": 0.1298, + "step": 2315 + }, + { + "epoch": 19.794871794871796, + "grad_norm": 3.2554516792297363, + "learning_rate": 1.0256410256410257e-07, + "loss": 0.0432, + "step": 2316 + }, + { + "epoch": 19.803418803418804, + "grad_norm": 1.7984355688095093, + "learning_rate": 9.829059829059829e-08, + "loss": 0.0461, + "step": 2317 + }, + { + "epoch": 19.811965811965813, + "grad_norm": 1.633736491203308, + "learning_rate": 9.401709401709402e-08, + "loss": 0.0746, + "step": 2318 + }, + { + "epoch": 19.82051282051282, + "grad_norm": 2.6958866119384766, + "learning_rate": 8.974358974358974e-08, + "loss": 0.0852, + "step": 2319 + }, + { + "epoch": 19.82905982905983, + "grad_norm": 0.9744161367416382, + "learning_rate": 8.547008547008549e-08, + "loss": 0.0368, + "step": 2320 + }, + { + "epoch": 19.837606837606838, + "grad_norm": 1.2404037714004517, + "learning_rate": 8.11965811965812e-08, + "loss": 0.0547, + "step": 2321 + }, + { + "epoch": 19.846153846153847, + "grad_norm": 1.6044564247131348, + "learning_rate": 7.692307692307694e-08, + "loss": 0.0441, + "step": 2322 + }, + { + "epoch": 19.854700854700855, + "grad_norm": 0.47167596220970154, + "learning_rate": 7.264957264957265e-08, + "loss": 0.0099, + "step": 2323 + }, + { + "epoch": 19.863247863247864, + "grad_norm": 1.6729376316070557, + "learning_rate": 6.837606837606839e-08, + "loss": 0.0258, + "step": 2324 + }, + { + "epoch": 19.871794871794872, + "grad_norm": 0.5823857188224792, + "learning_rate": 6.41025641025641e-08, + "loss": 0.0131, + "step": 2325 + }, + { + "epoch": 19.88034188034188, + "grad_norm": 4.055545806884766, + "learning_rate": 5.982905982905984e-08, + "loss": 0.073, + "step": 2326 + }, + { + "epoch": 19.88888888888889, + "grad_norm": 2.693838596343994, + "learning_rate": 5.555555555555556e-08, + "loss": 0.0845, + "step": 2327 + }, + { + "epoch": 19.897435897435898, + "grad_norm": 0.9895898103713989, + "learning_rate": 5.1282051282051286e-08, + "loss": 0.0205, + "step": 2328 + }, + { + "epoch": 19.905982905982906, + "grad_norm": 3.560816526412964, + "learning_rate": 4.700854700854701e-08, + "loss": 0.0989, + "step": 2329 + }, + { + "epoch": 19.914529914529915, + "grad_norm": 5.152528762817383, + "learning_rate": 4.273504273504274e-08, + "loss": 0.0133, + "step": 2330 + }, + { + "epoch": 19.923076923076923, + "grad_norm": 1.709021806716919, + "learning_rate": 3.846153846153847e-08, + "loss": 0.068, + "step": 2331 + }, + { + "epoch": 19.931623931623932, + "grad_norm": 0.4786951541900635, + "learning_rate": 3.418803418803419e-08, + "loss": 0.0141, + "step": 2332 + }, + { + "epoch": 19.94017094017094, + "grad_norm": 1.5413727760314941, + "learning_rate": 2.991452991452992e-08, + "loss": 0.0246, + "step": 2333 + }, + { + "epoch": 19.94871794871795, + "grad_norm": 1.019601583480835, + "learning_rate": 2.5641025641025643e-08, + "loss": 0.0199, + "step": 2334 + }, + { + "epoch": 19.957264957264957, + "grad_norm": 1.6115524768829346, + "learning_rate": 2.136752136752137e-08, + "loss": 0.0752, + "step": 2335 + }, + { + "epoch": 19.965811965811966, + "grad_norm": 2.381624698638916, + "learning_rate": 1.7094017094017096e-08, + "loss": 0.0609, + "step": 2336 + }, + { + "epoch": 19.974358974358974, + "grad_norm": 1.688704013824463, + "learning_rate": 1.2820512820512822e-08, + "loss": 0.0419, + "step": 2337 + }, + { + "epoch": 19.982905982905983, + "grad_norm": 1.643002986907959, + "learning_rate": 8.547008547008548e-09, + "loss": 0.0456, + "step": 2338 + }, + { + "epoch": 19.99145299145299, + "grad_norm": 3.5371882915496826, + "learning_rate": 4.273504273504274e-09, + "loss": 0.0392, + "step": 2339 + }, + { + "epoch": 20.0, + "grad_norm": 4.692568302154541, + "learning_rate": 0.0, + "loss": 0.1751, + "step": 2340 + }, + { + "epoch": 20.0, + "eval_loss": 0.051427390426397324, + "eval_runtime": 9.301, + "eval_samples_per_second": 50.102, + "eval_steps_per_second": 6.343, + "step": 2340 + } + ], + "logging_steps": 1, + "max_steps": 2340, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 560912565657600.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}