{ "best_metric": 0.051427390426397324, "best_model_checkpoint": "time_base/checkpoint-2340", "epoch": 20.0, "eval_steps": 500, "global_step": 2340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008547008547008548, "grad_norm": 221.6373748779297, "learning_rate": 9.995726495726496e-06, "loss": 37.5765, "step": 1 }, { "epoch": 0.017094017094017096, "grad_norm": 219.50563049316406, "learning_rate": 9.991452991452993e-06, "loss": 38.6173, "step": 2 }, { "epoch": 0.02564102564102564, "grad_norm": 180.23829650878906, "learning_rate": 9.987179487179488e-06, "loss": 40.3853, "step": 3 }, { "epoch": 0.03418803418803419, "grad_norm": 166.3365478515625, "learning_rate": 9.982905982905984e-06, "loss": 35.9724, "step": 4 }, { "epoch": 0.042735042735042736, "grad_norm": 199.6571044921875, "learning_rate": 9.97863247863248e-06, "loss": 35.0186, "step": 5 }, { "epoch": 0.05128205128205128, "grad_norm": 180.9748992919922, "learning_rate": 9.974358974358974e-06, "loss": 39.3679, "step": 6 }, { "epoch": 0.05982905982905983, "grad_norm": 200.05496215820312, "learning_rate": 9.970085470085471e-06, "loss": 37.1519, "step": 7 }, { "epoch": 0.06837606837606838, "grad_norm": 154.3177032470703, "learning_rate": 9.965811965811966e-06, "loss": 33.9309, "step": 8 }, { "epoch": 0.07692307692307693, "grad_norm": 198.05914306640625, "learning_rate": 9.961538461538463e-06, "loss": 34.8814, "step": 9 }, { "epoch": 0.08547008547008547, "grad_norm": 168.3035430908203, "learning_rate": 9.957264957264958e-06, "loss": 33.184, "step": 10 }, { "epoch": 0.09401709401709402, "grad_norm": 201.83705139160156, "learning_rate": 9.952991452991455e-06, "loss": 35.4025, "step": 11 }, { "epoch": 0.10256410256410256, "grad_norm": 224.4587860107422, "learning_rate": 9.94871794871795e-06, "loss": 39.222, "step": 12 }, { "epoch": 0.1111111111111111, "grad_norm": 192.1949005126953, "learning_rate": 9.944444444444445e-06, "loss": 37.1982, "step": 13 }, { "epoch": 0.11965811965811966, "grad_norm": 193.05662536621094, "learning_rate": 9.940170940170942e-06, "loss": 38.1325, "step": 14 }, { "epoch": 0.1282051282051282, "grad_norm": 150.61575317382812, "learning_rate": 9.935897435897437e-06, "loss": 34.8682, "step": 15 }, { "epoch": 0.13675213675213677, "grad_norm": 170.1510772705078, "learning_rate": 9.931623931623933e-06, "loss": 33.3652, "step": 16 }, { "epoch": 0.1452991452991453, "grad_norm": 193.86875915527344, "learning_rate": 9.927350427350428e-06, "loss": 35.0785, "step": 17 }, { "epoch": 0.15384615384615385, "grad_norm": 164.41986083984375, "learning_rate": 9.923076923076923e-06, "loss": 31.9719, "step": 18 }, { "epoch": 0.1623931623931624, "grad_norm": 166.08953857421875, "learning_rate": 9.91880341880342e-06, "loss": 34.5398, "step": 19 }, { "epoch": 0.17094017094017094, "grad_norm": 152.2139892578125, "learning_rate": 9.914529914529915e-06, "loss": 36.9092, "step": 20 }, { "epoch": 0.1794871794871795, "grad_norm": 198.23095703125, "learning_rate": 9.910256410256412e-06, "loss": 35.6744, "step": 21 }, { "epoch": 0.18803418803418803, "grad_norm": 174.7784881591797, "learning_rate": 9.905982905982907e-06, "loss": 32.8258, "step": 22 }, { "epoch": 0.19658119658119658, "grad_norm": 133.69859313964844, "learning_rate": 9.901709401709402e-06, "loss": 31.431, "step": 23 }, { "epoch": 0.20512820512820512, "grad_norm": 217.17169189453125, "learning_rate": 9.897435897435899e-06, "loss": 38.5649, "step": 24 }, { "epoch": 0.21367521367521367, "grad_norm": 172.4914093017578, "learning_rate": 9.893162393162394e-06, "loss": 33.9858, "step": 25 }, { "epoch": 0.2222222222222222, "grad_norm": 186.39654541015625, "learning_rate": 9.88888888888889e-06, "loss": 32.8029, "step": 26 }, { "epoch": 0.23076923076923078, "grad_norm": 183.65159606933594, "learning_rate": 9.884615384615386e-06, "loss": 35.8633, "step": 27 }, { "epoch": 0.23931623931623933, "grad_norm": 228.352294921875, "learning_rate": 9.880341880341882e-06, "loss": 35.0285, "step": 28 }, { "epoch": 0.24786324786324787, "grad_norm": 156.77906799316406, "learning_rate": 9.876068376068377e-06, "loss": 29.2608, "step": 29 }, { "epoch": 0.2564102564102564, "grad_norm": 232.8336181640625, "learning_rate": 9.871794871794872e-06, "loss": 35.0349, "step": 30 }, { "epoch": 0.26495726495726496, "grad_norm": 248.63247680664062, "learning_rate": 9.86752136752137e-06, "loss": 34.5067, "step": 31 }, { "epoch": 0.27350427350427353, "grad_norm": 183.5840606689453, "learning_rate": 9.863247863247864e-06, "loss": 30.4758, "step": 32 }, { "epoch": 0.28205128205128205, "grad_norm": 160.54530334472656, "learning_rate": 9.858974358974361e-06, "loss": 31.7959, "step": 33 }, { "epoch": 0.2905982905982906, "grad_norm": 199.88156127929688, "learning_rate": 9.854700854700856e-06, "loss": 35.6482, "step": 34 }, { "epoch": 0.29914529914529914, "grad_norm": 272.9530029296875, "learning_rate": 9.850427350427351e-06, "loss": 33.0804, "step": 35 }, { "epoch": 0.3076923076923077, "grad_norm": 200.0990447998047, "learning_rate": 9.846153846153848e-06, "loss": 33.2675, "step": 36 }, { "epoch": 0.3162393162393162, "grad_norm": 202.014404296875, "learning_rate": 9.841880341880343e-06, "loss": 30.8991, "step": 37 }, { "epoch": 0.3247863247863248, "grad_norm": 181.14865112304688, "learning_rate": 9.837606837606838e-06, "loss": 32.3643, "step": 38 }, { "epoch": 0.3333333333333333, "grad_norm": 134.43423461914062, "learning_rate": 9.833333333333333e-06, "loss": 30.8094, "step": 39 }, { "epoch": 0.3418803418803419, "grad_norm": 155.96640014648438, "learning_rate": 9.82905982905983e-06, "loss": 31.7564, "step": 40 }, { "epoch": 0.3504273504273504, "grad_norm": 146.9285888671875, "learning_rate": 9.824786324786325e-06, "loss": 31.9905, "step": 41 }, { "epoch": 0.358974358974359, "grad_norm": 159.67974853515625, "learning_rate": 9.820512820512821e-06, "loss": 32.5029, "step": 42 }, { "epoch": 0.36752136752136755, "grad_norm": 172.4975128173828, "learning_rate": 9.816239316239316e-06, "loss": 31.2049, "step": 43 }, { "epoch": 0.37606837606837606, "grad_norm": 148.97573852539062, "learning_rate": 9.811965811965812e-06, "loss": 27.1673, "step": 44 }, { "epoch": 0.38461538461538464, "grad_norm": 115.93009185791016, "learning_rate": 9.807692307692308e-06, "loss": 30.3342, "step": 45 }, { "epoch": 0.39316239316239315, "grad_norm": 184.13145446777344, "learning_rate": 9.803418803418803e-06, "loss": 32.317, "step": 46 }, { "epoch": 0.4017094017094017, "grad_norm": 139.3995361328125, "learning_rate": 9.7991452991453e-06, "loss": 29.9643, "step": 47 }, { "epoch": 0.41025641025641024, "grad_norm": 184.97996520996094, "learning_rate": 9.794871794871795e-06, "loss": 30.6427, "step": 48 }, { "epoch": 0.4188034188034188, "grad_norm": 120.04417419433594, "learning_rate": 9.790598290598292e-06, "loss": 26.9772, "step": 49 }, { "epoch": 0.42735042735042733, "grad_norm": 183.2873077392578, "learning_rate": 9.786324786324787e-06, "loss": 31.6688, "step": 50 }, { "epoch": 0.4358974358974359, "grad_norm": 206.44898986816406, "learning_rate": 9.782051282051282e-06, "loss": 32.0574, "step": 51 }, { "epoch": 0.4444444444444444, "grad_norm": 180.7601318359375, "learning_rate": 9.777777777777779e-06, "loss": 31.2178, "step": 52 }, { "epoch": 0.452991452991453, "grad_norm": 150.44012451171875, "learning_rate": 9.773504273504274e-06, "loss": 29.9826, "step": 53 }, { "epoch": 0.46153846153846156, "grad_norm": 119.02840423583984, "learning_rate": 9.76923076923077e-06, "loss": 26.876, "step": 54 }, { "epoch": 0.4700854700854701, "grad_norm": 164.58209228515625, "learning_rate": 9.764957264957265e-06, "loss": 28.1059, "step": 55 }, { "epoch": 0.47863247863247865, "grad_norm": 160.416259765625, "learning_rate": 9.76068376068376e-06, "loss": 28.7022, "step": 56 }, { "epoch": 0.48717948717948717, "grad_norm": 177.29747009277344, "learning_rate": 9.756410256410257e-06, "loss": 30.7275, "step": 57 }, { "epoch": 0.49572649572649574, "grad_norm": 153.59686279296875, "learning_rate": 9.752136752136752e-06, "loss": 28.5575, "step": 58 }, { "epoch": 0.5042735042735043, "grad_norm": 155.79617309570312, "learning_rate": 9.747863247863249e-06, "loss": 28.1139, "step": 59 }, { "epoch": 0.5128205128205128, "grad_norm": 173.02581787109375, "learning_rate": 9.743589743589744e-06, "loss": 30.4744, "step": 60 }, { "epoch": 0.5213675213675214, "grad_norm": 125.31639862060547, "learning_rate": 9.739316239316239e-06, "loss": 26.5559, "step": 61 }, { "epoch": 0.5299145299145299, "grad_norm": 149.00302124023438, "learning_rate": 9.735042735042736e-06, "loss": 30.4065, "step": 62 }, { "epoch": 0.5384615384615384, "grad_norm": 101.76395416259766, "learning_rate": 9.730769230769231e-06, "loss": 25.8895, "step": 63 }, { "epoch": 0.5470085470085471, "grad_norm": 134.40159606933594, "learning_rate": 9.726495726495728e-06, "loss": 26.9317, "step": 64 }, { "epoch": 0.5555555555555556, "grad_norm": 151.01914978027344, "learning_rate": 9.722222222222223e-06, "loss": 27.9913, "step": 65 }, { "epoch": 0.5641025641025641, "grad_norm": 124.92068481445312, "learning_rate": 9.71794871794872e-06, "loss": 26.7874, "step": 66 }, { "epoch": 0.5726495726495726, "grad_norm": 131.29762268066406, "learning_rate": 9.713675213675214e-06, "loss": 27.4047, "step": 67 }, { "epoch": 0.5811965811965812, "grad_norm": 154.37120056152344, "learning_rate": 9.70940170940171e-06, "loss": 26.6812, "step": 68 }, { "epoch": 0.5897435897435898, "grad_norm": 86.31095886230469, "learning_rate": 9.705128205128206e-06, "loss": 22.9869, "step": 69 }, { "epoch": 0.5982905982905983, "grad_norm": 224.42613220214844, "learning_rate": 9.700854700854701e-06, "loss": 28.4812, "step": 70 }, { "epoch": 0.6068376068376068, "grad_norm": 156.15228271484375, "learning_rate": 9.696581196581198e-06, "loss": 26.1761, "step": 71 }, { "epoch": 0.6153846153846154, "grad_norm": 117.7806167602539, "learning_rate": 9.692307692307693e-06, "loss": 20.7307, "step": 72 }, { "epoch": 0.6239316239316239, "grad_norm": 169.99154663085938, "learning_rate": 9.688034188034188e-06, "loss": 27.6369, "step": 73 }, { "epoch": 0.6324786324786325, "grad_norm": 98.81549072265625, "learning_rate": 9.683760683760685e-06, "loss": 24.5898, "step": 74 }, { "epoch": 0.6410256410256411, "grad_norm": 199.0179443359375, "learning_rate": 9.67948717948718e-06, "loss": 27.664, "step": 75 }, { "epoch": 0.6495726495726496, "grad_norm": 129.81033325195312, "learning_rate": 9.675213675213677e-06, "loss": 25.2547, "step": 76 }, { "epoch": 0.6581196581196581, "grad_norm": 140.1121826171875, "learning_rate": 9.670940170940172e-06, "loss": 27.4914, "step": 77 }, { "epoch": 0.6666666666666666, "grad_norm": 139.8365478515625, "learning_rate": 9.666666666666667e-06, "loss": 24.0178, "step": 78 }, { "epoch": 0.6752136752136753, "grad_norm": 147.24945068359375, "learning_rate": 9.662393162393163e-06, "loss": 27.1404, "step": 79 }, { "epoch": 0.6837606837606838, "grad_norm": 165.67242431640625, "learning_rate": 9.658119658119659e-06, "loss": 25.6604, "step": 80 }, { "epoch": 0.6923076923076923, "grad_norm": 114.36772918701172, "learning_rate": 9.653846153846155e-06, "loss": 24.3695, "step": 81 }, { "epoch": 0.7008547008547008, "grad_norm": 149.76258850097656, "learning_rate": 9.64957264957265e-06, "loss": 26.5265, "step": 82 }, { "epoch": 0.7094017094017094, "grad_norm": 121.9085693359375, "learning_rate": 9.645299145299147e-06, "loss": 25.7008, "step": 83 }, { "epoch": 0.717948717948718, "grad_norm": 106.49151611328125, "learning_rate": 9.641025641025642e-06, "loss": 20.5777, "step": 84 }, { "epoch": 0.7264957264957265, "grad_norm": 114.2357406616211, "learning_rate": 9.636752136752137e-06, "loss": 23.3429, "step": 85 }, { "epoch": 0.7350427350427351, "grad_norm": 107.38651275634766, "learning_rate": 9.632478632478634e-06, "loss": 24.6408, "step": 86 }, { "epoch": 0.7435897435897436, "grad_norm": 120.4283218383789, "learning_rate": 9.628205128205129e-06, "loss": 23.4563, "step": 87 }, { "epoch": 0.7521367521367521, "grad_norm": 165.21783447265625, "learning_rate": 9.623931623931626e-06, "loss": 25.878, "step": 88 }, { "epoch": 0.7606837606837606, "grad_norm": 105.8712387084961, "learning_rate": 9.61965811965812e-06, "loss": 23.605, "step": 89 }, { "epoch": 0.7692307692307693, "grad_norm": 147.31253051757812, "learning_rate": 9.615384615384616e-06, "loss": 24.537, "step": 90 }, { "epoch": 0.7777777777777778, "grad_norm": 127.37718963623047, "learning_rate": 9.611111111111112e-06, "loss": 24.6762, "step": 91 }, { "epoch": 0.7863247863247863, "grad_norm": 139.40553283691406, "learning_rate": 9.606837606837607e-06, "loss": 23.6076, "step": 92 }, { "epoch": 0.7948717948717948, "grad_norm": 218.39170837402344, "learning_rate": 9.602564102564104e-06, "loss": 25.2559, "step": 93 }, { "epoch": 0.8034188034188035, "grad_norm": 115.83401489257812, "learning_rate": 9.5982905982906e-06, "loss": 23.6758, "step": 94 }, { "epoch": 0.811965811965812, "grad_norm": 115.8538818359375, "learning_rate": 9.594017094017094e-06, "loss": 24.2789, "step": 95 }, { "epoch": 0.8205128205128205, "grad_norm": 122.31534576416016, "learning_rate": 9.589743589743591e-06, "loss": 23.5114, "step": 96 }, { "epoch": 0.8290598290598291, "grad_norm": 171.58558654785156, "learning_rate": 9.585470085470086e-06, "loss": 24.7028, "step": 97 }, { "epoch": 0.8376068376068376, "grad_norm": 113.29806518554688, "learning_rate": 9.581196581196583e-06, "loss": 24.9667, "step": 98 }, { "epoch": 0.8461538461538461, "grad_norm": 183.74928283691406, "learning_rate": 9.576923076923078e-06, "loss": 24.7776, "step": 99 }, { "epoch": 0.8547008547008547, "grad_norm": 139.84701538085938, "learning_rate": 9.572649572649575e-06, "loss": 22.1558, "step": 100 }, { "epoch": 0.8632478632478633, "grad_norm": 145.9014129638672, "learning_rate": 9.56837606837607e-06, "loss": 23.0282, "step": 101 }, { "epoch": 0.8717948717948718, "grad_norm": 195.9859619140625, "learning_rate": 9.564102564102565e-06, "loss": 23.7194, "step": 102 }, { "epoch": 0.8803418803418803, "grad_norm": 70.51985168457031, "learning_rate": 9.559829059829061e-06, "loss": 16.9605, "step": 103 }, { "epoch": 0.8888888888888888, "grad_norm": 184.04209899902344, "learning_rate": 9.555555555555556e-06, "loss": 23.4229, "step": 104 }, { "epoch": 0.8974358974358975, "grad_norm": 177.86727905273438, "learning_rate": 9.551282051282053e-06, "loss": 23.6004, "step": 105 }, { "epoch": 0.905982905982906, "grad_norm": 154.30784606933594, "learning_rate": 9.547008547008548e-06, "loss": 21.6725, "step": 106 }, { "epoch": 0.9145299145299145, "grad_norm": 104.27069854736328, "learning_rate": 9.542735042735043e-06, "loss": 22.856, "step": 107 }, { "epoch": 0.9230769230769231, "grad_norm": 157.4270477294922, "learning_rate": 9.53846153846154e-06, "loss": 24.398, "step": 108 }, { "epoch": 0.9316239316239316, "grad_norm": 123.56739807128906, "learning_rate": 9.534188034188035e-06, "loss": 20.6925, "step": 109 }, { "epoch": 0.9401709401709402, "grad_norm": 106.64054870605469, "learning_rate": 9.52991452991453e-06, "loss": 23.5794, "step": 110 }, { "epoch": 0.9487179487179487, "grad_norm": 88.68234252929688, "learning_rate": 9.525641025641025e-06, "loss": 20.729, "step": 111 }, { "epoch": 0.9572649572649573, "grad_norm": 91.86422729492188, "learning_rate": 9.521367521367522e-06, "loss": 18.7701, "step": 112 }, { "epoch": 0.9658119658119658, "grad_norm": 118.74354553222656, "learning_rate": 9.517094017094017e-06, "loss": 20.8439, "step": 113 }, { "epoch": 0.9743589743589743, "grad_norm": 120.72904968261719, "learning_rate": 9.512820512820514e-06, "loss": 21.1903, "step": 114 }, { "epoch": 0.9829059829059829, "grad_norm": 107.36665344238281, "learning_rate": 9.508547008547009e-06, "loss": 19.3457, "step": 115 }, { "epoch": 0.9914529914529915, "grad_norm": 131.74441528320312, "learning_rate": 9.504273504273504e-06, "loss": 21.4035, "step": 116 }, { "epoch": 1.0, "grad_norm": 161.97703552246094, "learning_rate": 9.5e-06, "loss": 22.3831, "step": 117 }, { "epoch": 1.0, "eval_loss": 17.230430603027344, "eval_runtime": 9.9187, "eval_samples_per_second": 46.982, "eval_steps_per_second": 5.948, "step": 117 }, { "epoch": 1.0085470085470085, "grad_norm": 109.44770050048828, "learning_rate": 9.495726495726496e-06, "loss": 20.3406, "step": 118 }, { "epoch": 1.017094017094017, "grad_norm": 96.50030517578125, "learning_rate": 9.491452991452992e-06, "loss": 19.8086, "step": 119 }, { "epoch": 1.0256410256410255, "grad_norm": 91.27509307861328, "learning_rate": 9.487179487179487e-06, "loss": 18.8737, "step": 120 }, { "epoch": 1.0341880341880343, "grad_norm": 123.94478607177734, "learning_rate": 9.482905982905984e-06, "loss": 20.1785, "step": 121 }, { "epoch": 1.0427350427350428, "grad_norm": 109.29426574707031, "learning_rate": 9.478632478632479e-06, "loss": 18.8151, "step": 122 }, { "epoch": 1.0512820512820513, "grad_norm": 104.0233383178711, "learning_rate": 9.474358974358974e-06, "loss": 19.6281, "step": 123 }, { "epoch": 1.0598290598290598, "grad_norm": 75.7523193359375, "learning_rate": 9.470085470085471e-06, "loss": 18.5031, "step": 124 }, { "epoch": 1.0683760683760684, "grad_norm": 103.1374740600586, "learning_rate": 9.465811965811966e-06, "loss": 19.6443, "step": 125 }, { "epoch": 1.0769230769230769, "grad_norm": 92.68035888671875, "learning_rate": 9.461538461538463e-06, "loss": 19.7327, "step": 126 }, { "epoch": 1.0854700854700854, "grad_norm": 88.10079193115234, "learning_rate": 9.457264957264958e-06, "loss": 17.8832, "step": 127 }, { "epoch": 1.0940170940170941, "grad_norm": 80.04244232177734, "learning_rate": 9.452991452991453e-06, "loss": 16.4485, "step": 128 }, { "epoch": 1.1025641025641026, "grad_norm": 81.02445983886719, "learning_rate": 9.44871794871795e-06, "loss": 17.3035, "step": 129 }, { "epoch": 1.1111111111111112, "grad_norm": 98.95979309082031, "learning_rate": 9.444444444444445e-06, "loss": 17.5734, "step": 130 }, { "epoch": 1.1196581196581197, "grad_norm": 109.76984405517578, "learning_rate": 9.440170940170941e-06, "loss": 20.3985, "step": 131 }, { "epoch": 1.1282051282051282, "grad_norm": 98.52857208251953, "learning_rate": 9.435897435897436e-06, "loss": 17.7275, "step": 132 }, { "epoch": 1.1367521367521367, "grad_norm": 91.28802490234375, "learning_rate": 9.431623931623931e-06, "loss": 17.9107, "step": 133 }, { "epoch": 1.1452991452991452, "grad_norm": 92.89081573486328, "learning_rate": 9.427350427350428e-06, "loss": 18.2876, "step": 134 }, { "epoch": 1.1538461538461537, "grad_norm": 78.9795150756836, "learning_rate": 9.423076923076923e-06, "loss": 15.5738, "step": 135 }, { "epoch": 1.1623931623931625, "grad_norm": 83.77166748046875, "learning_rate": 9.41880341880342e-06, "loss": 16.0825, "step": 136 }, { "epoch": 1.170940170940171, "grad_norm": 129.62966918945312, "learning_rate": 9.414529914529915e-06, "loss": 18.4077, "step": 137 }, { "epoch": 1.1794871794871795, "grad_norm": 110.26199340820312, "learning_rate": 9.410256410256412e-06, "loss": 17.6436, "step": 138 }, { "epoch": 1.188034188034188, "grad_norm": 95.36865997314453, "learning_rate": 9.405982905982907e-06, "loss": 19.0424, "step": 139 }, { "epoch": 1.1965811965811965, "grad_norm": 98.36263275146484, "learning_rate": 9.401709401709402e-06, "loss": 16.6122, "step": 140 }, { "epoch": 1.205128205128205, "grad_norm": 83.68401336669922, "learning_rate": 9.397435897435899e-06, "loss": 14.9218, "step": 141 }, { "epoch": 1.2136752136752136, "grad_norm": 92.4602279663086, "learning_rate": 9.393162393162394e-06, "loss": 16.3563, "step": 142 }, { "epoch": 1.2222222222222223, "grad_norm": 106.41629791259766, "learning_rate": 9.38888888888889e-06, "loss": 16.4447, "step": 143 }, { "epoch": 1.2307692307692308, "grad_norm": 97.70237731933594, "learning_rate": 9.384615384615385e-06, "loss": 16.8154, "step": 144 }, { "epoch": 1.2393162393162394, "grad_norm": 76.88361358642578, "learning_rate": 9.38034188034188e-06, "loss": 15.7116, "step": 145 }, { "epoch": 1.2478632478632479, "grad_norm": 104.20966339111328, "learning_rate": 9.376068376068377e-06, "loss": 15.2283, "step": 146 }, { "epoch": 1.2564102564102564, "grad_norm": 80.29965209960938, "learning_rate": 9.371794871794872e-06, "loss": 15.3238, "step": 147 }, { "epoch": 1.264957264957265, "grad_norm": 72.6979751586914, "learning_rate": 9.367521367521369e-06, "loss": 14.2293, "step": 148 }, { "epoch": 1.2735042735042734, "grad_norm": 80.29464721679688, "learning_rate": 9.363247863247864e-06, "loss": 11.9706, "step": 149 }, { "epoch": 1.282051282051282, "grad_norm": 97.80663299560547, "learning_rate": 9.358974358974359e-06, "loss": 14.3517, "step": 150 }, { "epoch": 1.2905982905982907, "grad_norm": 75.88921356201172, "learning_rate": 9.354700854700856e-06, "loss": 12.8289, "step": 151 }, { "epoch": 1.2991452991452992, "grad_norm": 75.71963500976562, "learning_rate": 9.35042735042735e-06, "loss": 15.2496, "step": 152 }, { "epoch": 1.3076923076923077, "grad_norm": 84.5454330444336, "learning_rate": 9.346153846153847e-06, "loss": 15.7946, "step": 153 }, { "epoch": 1.3162393162393162, "grad_norm": 92.24919128417969, "learning_rate": 9.341880341880343e-06, "loss": 13.2751, "step": 154 }, { "epoch": 1.3247863247863247, "grad_norm": 76.51255798339844, "learning_rate": 9.33760683760684e-06, "loss": 14.1861, "step": 155 }, { "epoch": 1.3333333333333333, "grad_norm": 74.59149169921875, "learning_rate": 9.333333333333334e-06, "loss": 12.1881, "step": 156 }, { "epoch": 1.341880341880342, "grad_norm": 69.84959411621094, "learning_rate": 9.32905982905983e-06, "loss": 13.1244, "step": 157 }, { "epoch": 1.3504273504273505, "grad_norm": 82.09815979003906, "learning_rate": 9.324786324786326e-06, "loss": 12.7492, "step": 158 }, { "epoch": 1.358974358974359, "grad_norm": 87.25080108642578, "learning_rate": 9.320512820512821e-06, "loss": 15.5268, "step": 159 }, { "epoch": 1.3675213675213675, "grad_norm": 51.60975646972656, "learning_rate": 9.316239316239318e-06, "loss": 10.9868, "step": 160 }, { "epoch": 1.376068376068376, "grad_norm": 65.10023498535156, "learning_rate": 9.311965811965813e-06, "loss": 13.2106, "step": 161 }, { "epoch": 1.3846153846153846, "grad_norm": 86.3865737915039, "learning_rate": 9.307692307692308e-06, "loss": 12.4873, "step": 162 }, { "epoch": 1.393162393162393, "grad_norm": 89.5868911743164, "learning_rate": 9.303418803418805e-06, "loss": 12.3125, "step": 163 }, { "epoch": 1.4017094017094016, "grad_norm": 87.308837890625, "learning_rate": 9.2991452991453e-06, "loss": 13.1855, "step": 164 }, { "epoch": 1.4102564102564101, "grad_norm": 79.86372375488281, "learning_rate": 9.294871794871796e-06, "loss": 11.2756, "step": 165 }, { "epoch": 1.4188034188034189, "grad_norm": 64.42597961425781, "learning_rate": 9.290598290598292e-06, "loss": 11.7395, "step": 166 }, { "epoch": 1.4273504273504274, "grad_norm": 64.65245056152344, "learning_rate": 9.286324786324787e-06, "loss": 10.2739, "step": 167 }, { "epoch": 1.435897435897436, "grad_norm": 49.57310104370117, "learning_rate": 9.282051282051283e-06, "loss": 11.4798, "step": 168 }, { "epoch": 1.4444444444444444, "grad_norm": 89.93653106689453, "learning_rate": 9.277777777777778e-06, "loss": 13.8041, "step": 169 }, { "epoch": 1.452991452991453, "grad_norm": 59.6973876953125, "learning_rate": 9.273504273504275e-06, "loss": 11.0414, "step": 170 }, { "epoch": 1.4615384615384617, "grad_norm": 63.07640838623047, "learning_rate": 9.26923076923077e-06, "loss": 10.2649, "step": 171 }, { "epoch": 1.4700854700854702, "grad_norm": 121.3633041381836, "learning_rate": 9.264957264957267e-06, "loss": 11.9233, "step": 172 }, { "epoch": 1.4786324786324787, "grad_norm": 50.96989822387695, "learning_rate": 9.260683760683762e-06, "loss": 8.3527, "step": 173 }, { "epoch": 1.4871794871794872, "grad_norm": 71.61744689941406, "learning_rate": 9.256410256410257e-06, "loss": 11.4237, "step": 174 }, { "epoch": 1.4957264957264957, "grad_norm": 69.43048858642578, "learning_rate": 9.252136752136754e-06, "loss": 9.9193, "step": 175 }, { "epoch": 1.5042735042735043, "grad_norm": 130.2714385986328, "learning_rate": 9.247863247863249e-06, "loss": 12.0676, "step": 176 }, { "epoch": 1.5128205128205128, "grad_norm": 51.40456008911133, "learning_rate": 9.243589743589745e-06, "loss": 9.2348, "step": 177 }, { "epoch": 1.5213675213675213, "grad_norm": 48.94670486450195, "learning_rate": 9.23931623931624e-06, "loss": 8.8217, "step": 178 }, { "epoch": 1.5299145299145298, "grad_norm": 54.54533386230469, "learning_rate": 9.235042735042736e-06, "loss": 9.2478, "step": 179 }, { "epoch": 1.5384615384615383, "grad_norm": 46.581939697265625, "learning_rate": 9.230769230769232e-06, "loss": 8.746, "step": 180 }, { "epoch": 1.547008547008547, "grad_norm": 49.31954574584961, "learning_rate": 9.226495726495727e-06, "loss": 8.7889, "step": 181 }, { "epoch": 1.5555555555555556, "grad_norm": 48.5145378112793, "learning_rate": 9.222222222222224e-06, "loss": 8.4478, "step": 182 }, { "epoch": 1.564102564102564, "grad_norm": 49.587825775146484, "learning_rate": 9.217948717948717e-06, "loss": 10.5022, "step": 183 }, { "epoch": 1.5726495726495726, "grad_norm": 47.89423751831055, "learning_rate": 9.213675213675214e-06, "loss": 8.7681, "step": 184 }, { "epoch": 1.5811965811965814, "grad_norm": 59.971920013427734, "learning_rate": 9.20940170940171e-06, "loss": 9.6469, "step": 185 }, { "epoch": 1.5897435897435899, "grad_norm": 41.139957427978516, "learning_rate": 9.205128205128206e-06, "loss": 8.5196, "step": 186 }, { "epoch": 1.5982905982905984, "grad_norm": 36.8078498840332, "learning_rate": 9.200854700854701e-06, "loss": 8.2513, "step": 187 }, { "epoch": 1.606837606837607, "grad_norm": 62.23011016845703, "learning_rate": 9.196581196581196e-06, "loss": 9.239, "step": 188 }, { "epoch": 1.6153846153846154, "grad_norm": 41.35377502441406, "learning_rate": 9.192307692307693e-06, "loss": 8.6788, "step": 189 }, { "epoch": 1.623931623931624, "grad_norm": 53.734134674072266, "learning_rate": 9.188034188034188e-06, "loss": 8.2624, "step": 190 }, { "epoch": 1.6324786324786325, "grad_norm": 60.738887786865234, "learning_rate": 9.183760683760685e-06, "loss": 9.2777, "step": 191 }, { "epoch": 1.641025641025641, "grad_norm": 26.411643981933594, "learning_rate": 9.17948717948718e-06, "loss": 7.6894, "step": 192 }, { "epoch": 1.6495726495726495, "grad_norm": 37.81135940551758, "learning_rate": 9.175213675213676e-06, "loss": 8.009, "step": 193 }, { "epoch": 1.658119658119658, "grad_norm": 42.451080322265625, "learning_rate": 9.170940170940171e-06, "loss": 8.309, "step": 194 }, { "epoch": 1.6666666666666665, "grad_norm": 54.87519073486328, "learning_rate": 9.166666666666666e-06, "loss": 8.3505, "step": 195 }, { "epoch": 1.6752136752136753, "grad_norm": 47.997737884521484, "learning_rate": 9.162393162393163e-06, "loss": 8.9444, "step": 196 }, { "epoch": 1.6837606837606838, "grad_norm": 33.1911506652832, "learning_rate": 9.158119658119658e-06, "loss": 6.8856, "step": 197 }, { "epoch": 1.6923076923076923, "grad_norm": 28.42953872680664, "learning_rate": 9.153846153846155e-06, "loss": 7.0575, "step": 198 }, { "epoch": 1.7008547008547008, "grad_norm": 34.74330520629883, "learning_rate": 9.14957264957265e-06, "loss": 7.6837, "step": 199 }, { "epoch": 1.7094017094017095, "grad_norm": 27.730812072753906, "learning_rate": 9.145299145299145e-06, "loss": 7.2591, "step": 200 }, { "epoch": 1.717948717948718, "grad_norm": 36.658966064453125, "learning_rate": 9.141025641025642e-06, "loss": 7.6744, "step": 201 }, { "epoch": 1.7264957264957266, "grad_norm": 52.580074310302734, "learning_rate": 9.136752136752137e-06, "loss": 8.9746, "step": 202 }, { "epoch": 1.735042735042735, "grad_norm": 26.30430030822754, "learning_rate": 9.132478632478634e-06, "loss": 7.0829, "step": 203 }, { "epoch": 1.7435897435897436, "grad_norm": 35.77456283569336, "learning_rate": 9.128205128205129e-06, "loss": 7.46, "step": 204 }, { "epoch": 1.7521367521367521, "grad_norm": 46.80126953125, "learning_rate": 9.123931623931624e-06, "loss": 8.0331, "step": 205 }, { "epoch": 1.7606837606837606, "grad_norm": 26.510988235473633, "learning_rate": 9.11965811965812e-06, "loss": 7.0434, "step": 206 }, { "epoch": 1.7692307692307692, "grad_norm": 30.846357345581055, "learning_rate": 9.115384615384615e-06, "loss": 6.9022, "step": 207 }, { "epoch": 1.7777777777777777, "grad_norm": 45.06099319458008, "learning_rate": 9.111111111111112e-06, "loss": 7.108, "step": 208 }, { "epoch": 1.7863247863247862, "grad_norm": 40.050079345703125, "learning_rate": 9.106837606837607e-06, "loss": 7.3628, "step": 209 }, { "epoch": 1.7948717948717947, "grad_norm": 32.066261291503906, "learning_rate": 9.102564102564104e-06, "loss": 7.3292, "step": 210 }, { "epoch": 1.8034188034188035, "grad_norm": 29.196252822875977, "learning_rate": 9.098290598290599e-06, "loss": 6.6194, "step": 211 }, { "epoch": 1.811965811965812, "grad_norm": 34.54549026489258, "learning_rate": 9.094017094017094e-06, "loss": 7.224, "step": 212 }, { "epoch": 1.8205128205128205, "grad_norm": 31.863550186157227, "learning_rate": 9.08974358974359e-06, "loss": 7.141, "step": 213 }, { "epoch": 1.8290598290598292, "grad_norm": 36.79090118408203, "learning_rate": 9.085470085470086e-06, "loss": 6.9572, "step": 214 }, { "epoch": 1.8376068376068377, "grad_norm": 24.298635482788086, "learning_rate": 9.081196581196583e-06, "loss": 6.6881, "step": 215 }, { "epoch": 1.8461538461538463, "grad_norm": 16.75456428527832, "learning_rate": 9.076923076923078e-06, "loss": 6.4055, "step": 216 }, { "epoch": 1.8547008547008548, "grad_norm": 20.152400970458984, "learning_rate": 9.072649572649573e-06, "loss": 6.9078, "step": 217 }, { "epoch": 1.8632478632478633, "grad_norm": 34.73337173461914, "learning_rate": 9.06837606837607e-06, "loss": 6.7923, "step": 218 }, { "epoch": 1.8717948717948718, "grad_norm": 28.418310165405273, "learning_rate": 9.064102564102564e-06, "loss": 6.9382, "step": 219 }, { "epoch": 1.8803418803418803, "grad_norm": 13.454174995422363, "learning_rate": 9.059829059829061e-06, "loss": 4.5504, "step": 220 }, { "epoch": 1.8888888888888888, "grad_norm": 20.746938705444336, "learning_rate": 9.055555555555556e-06, "loss": 6.4711, "step": 221 }, { "epoch": 1.8974358974358974, "grad_norm": 23.29437828063965, "learning_rate": 9.051282051282051e-06, "loss": 6.1381, "step": 222 }, { "epoch": 1.9059829059829059, "grad_norm": 31.720672607421875, "learning_rate": 9.047008547008548e-06, "loss": 6.7716, "step": 223 }, { "epoch": 1.9145299145299144, "grad_norm": 16.971572875976562, "learning_rate": 9.042735042735043e-06, "loss": 6.4734, "step": 224 }, { "epoch": 1.9230769230769231, "grad_norm": 25.185396194458008, "learning_rate": 9.03846153846154e-06, "loss": 6.2505, "step": 225 }, { "epoch": 1.9316239316239316, "grad_norm": 42.373863220214844, "learning_rate": 9.034188034188035e-06, "loss": 7.1968, "step": 226 }, { "epoch": 1.9401709401709402, "grad_norm": 21.06004524230957, "learning_rate": 9.029914529914532e-06, "loss": 6.082, "step": 227 }, { "epoch": 1.9487179487179487, "grad_norm": 21.413599014282227, "learning_rate": 9.025641025641027e-06, "loss": 6.2279, "step": 228 }, { "epoch": 1.9572649572649574, "grad_norm": 18.379974365234375, "learning_rate": 9.021367521367522e-06, "loss": 6.6032, "step": 229 }, { "epoch": 1.965811965811966, "grad_norm": 28.239042282104492, "learning_rate": 9.017094017094018e-06, "loss": 6.5428, "step": 230 }, { "epoch": 1.9743589743589745, "grad_norm": 17.92879867553711, "learning_rate": 9.012820512820513e-06, "loss": 5.986, "step": 231 }, { "epoch": 1.982905982905983, "grad_norm": 15.501392364501953, "learning_rate": 9.00854700854701e-06, "loss": 5.9526, "step": 232 }, { "epoch": 1.9914529914529915, "grad_norm": 23.742633819580078, "learning_rate": 9.004273504273505e-06, "loss": 6.2462, "step": 233 }, { "epoch": 2.0, "grad_norm": 28.22560691833496, "learning_rate": 9e-06, "loss": 5.8705, "step": 234 }, { "epoch": 2.0, "eval_loss": 5.379393577575684, "eval_runtime": 9.2791, "eval_samples_per_second": 50.22, "eval_steps_per_second": 6.358, "step": 234 }, { "epoch": 2.0085470085470085, "grad_norm": 21.7072696685791, "learning_rate": 8.995726495726497e-06, "loss": 6.2757, "step": 235 }, { "epoch": 2.017094017094017, "grad_norm": 20.955190658569336, "learning_rate": 8.991452991452992e-06, "loss": 5.7265, "step": 236 }, { "epoch": 2.0256410256410255, "grad_norm": 15.186567306518555, "learning_rate": 8.987179487179489e-06, "loss": 6.1958, "step": 237 }, { "epoch": 2.034188034188034, "grad_norm": 20.938766479492188, "learning_rate": 8.982905982905984e-06, "loss": 6.2317, "step": 238 }, { "epoch": 2.0427350427350426, "grad_norm": 18.457494735717773, "learning_rate": 8.978632478632479e-06, "loss": 6.4711, "step": 239 }, { "epoch": 2.051282051282051, "grad_norm": 43.505149841308594, "learning_rate": 8.974358974358976e-06, "loss": 5.9632, "step": 240 }, { "epoch": 2.0598290598290596, "grad_norm": 15.558544158935547, "learning_rate": 8.97008547008547e-06, "loss": 5.8099, "step": 241 }, { "epoch": 2.0683760683760686, "grad_norm": 22.20660400390625, "learning_rate": 8.965811965811967e-06, "loss": 5.7939, "step": 242 }, { "epoch": 2.076923076923077, "grad_norm": 15.866617202758789, "learning_rate": 8.961538461538462e-06, "loss": 5.9473, "step": 243 }, { "epoch": 2.0854700854700856, "grad_norm": 20.30729103088379, "learning_rate": 8.957264957264959e-06, "loss": 6.2028, "step": 244 }, { "epoch": 2.094017094017094, "grad_norm": 15.517614364624023, "learning_rate": 8.952991452991454e-06, "loss": 5.906, "step": 245 }, { "epoch": 2.1025641025641026, "grad_norm": 21.30764389038086, "learning_rate": 8.94871794871795e-06, "loss": 6.1907, "step": 246 }, { "epoch": 2.111111111111111, "grad_norm": 19.973115921020508, "learning_rate": 8.944444444444446e-06, "loss": 5.6895, "step": 247 }, { "epoch": 2.1196581196581197, "grad_norm": 17.40595817565918, "learning_rate": 8.940170940170941e-06, "loss": 5.4836, "step": 248 }, { "epoch": 2.128205128205128, "grad_norm": 27.667421340942383, "learning_rate": 8.935897435897438e-06, "loss": 5.9082, "step": 249 }, { "epoch": 2.1367521367521367, "grad_norm": 18.151315689086914, "learning_rate": 8.931623931623933e-06, "loss": 5.8102, "step": 250 }, { "epoch": 2.1452991452991452, "grad_norm": 15.390297889709473, "learning_rate": 8.927350427350428e-06, "loss": 5.5504, "step": 251 }, { "epoch": 2.1538461538461537, "grad_norm": 17.257841110229492, "learning_rate": 8.923076923076925e-06, "loss": 5.9043, "step": 252 }, { "epoch": 2.1623931623931623, "grad_norm": 19.2503604888916, "learning_rate": 8.91880341880342e-06, "loss": 5.8349, "step": 253 }, { "epoch": 2.1709401709401708, "grad_norm": 25.236759185791016, "learning_rate": 8.914529914529916e-06, "loss": 5.2908, "step": 254 }, { "epoch": 2.1794871794871793, "grad_norm": 13.771193504333496, "learning_rate": 8.910256410256411e-06, "loss": 5.4743, "step": 255 }, { "epoch": 2.1880341880341883, "grad_norm": 17.406471252441406, "learning_rate": 8.905982905982906e-06, "loss": 5.6856, "step": 256 }, { "epoch": 2.1965811965811968, "grad_norm": 14.727091789245605, "learning_rate": 8.901709401709401e-06, "loss": 5.7937, "step": 257 }, { "epoch": 2.2051282051282053, "grad_norm": 18.193246841430664, "learning_rate": 8.897435897435898e-06, "loss": 5.5704, "step": 258 }, { "epoch": 2.213675213675214, "grad_norm": 21.573726654052734, "learning_rate": 8.893162393162393e-06, "loss": 5.479, "step": 259 }, { "epoch": 2.2222222222222223, "grad_norm": 28.72640037536621, "learning_rate": 8.888888888888888e-06, "loss": 5.5096, "step": 260 }, { "epoch": 2.230769230769231, "grad_norm": 15.4992094039917, "learning_rate": 8.884615384615385e-06, "loss": 5.217, "step": 261 }, { "epoch": 2.2393162393162394, "grad_norm": 17.753416061401367, "learning_rate": 8.88034188034188e-06, "loss": 5.8173, "step": 262 }, { "epoch": 2.247863247863248, "grad_norm": 15.91961669921875, "learning_rate": 8.876068376068377e-06, "loss": 5.7171, "step": 263 }, { "epoch": 2.2564102564102564, "grad_norm": 23.30504035949707, "learning_rate": 8.871794871794872e-06, "loss": 5.6214, "step": 264 }, { "epoch": 2.264957264957265, "grad_norm": 15.583686828613281, "learning_rate": 8.867521367521369e-06, "loss": 5.2343, "step": 265 }, { "epoch": 2.2735042735042734, "grad_norm": 24.482046127319336, "learning_rate": 8.863247863247864e-06, "loss": 5.0747, "step": 266 }, { "epoch": 2.282051282051282, "grad_norm": 16.17924690246582, "learning_rate": 8.858974358974359e-06, "loss": 5.2645, "step": 267 }, { "epoch": 2.2905982905982905, "grad_norm": 19.538314819335938, "learning_rate": 8.854700854700855e-06, "loss": 5.3484, "step": 268 }, { "epoch": 2.299145299145299, "grad_norm": 14.472186088562012, "learning_rate": 8.85042735042735e-06, "loss": 5.8159, "step": 269 }, { "epoch": 2.3076923076923075, "grad_norm": 16.797805786132812, "learning_rate": 8.846153846153847e-06, "loss": 5.4466, "step": 270 }, { "epoch": 2.316239316239316, "grad_norm": 13.237580299377441, "learning_rate": 8.841880341880342e-06, "loss": 5.2189, "step": 271 }, { "epoch": 2.324786324786325, "grad_norm": 16.685317993164062, "learning_rate": 8.837606837606837e-06, "loss": 5.7098, "step": 272 }, { "epoch": 2.3333333333333335, "grad_norm": 16.63880729675293, "learning_rate": 8.833333333333334e-06, "loss": 5.0714, "step": 273 }, { "epoch": 2.341880341880342, "grad_norm": 20.871978759765625, "learning_rate": 8.829059829059829e-06, "loss": 4.9509, "step": 274 }, { "epoch": 2.3504273504273505, "grad_norm": 16.95268440246582, "learning_rate": 8.824786324786326e-06, "loss": 5.4166, "step": 275 }, { "epoch": 2.358974358974359, "grad_norm": 15.446279525756836, "learning_rate": 8.820512820512821e-06, "loss": 4.5967, "step": 276 }, { "epoch": 2.3675213675213675, "grad_norm": 17.148235321044922, "learning_rate": 8.816239316239316e-06, "loss": 5.2542, "step": 277 }, { "epoch": 2.376068376068376, "grad_norm": 17.014827728271484, "learning_rate": 8.811965811965813e-06, "loss": 5.4702, "step": 278 }, { "epoch": 2.3846153846153846, "grad_norm": 15.313383102416992, "learning_rate": 8.807692307692308e-06, "loss": 5.2119, "step": 279 }, { "epoch": 2.393162393162393, "grad_norm": 20.2298641204834, "learning_rate": 8.803418803418804e-06, "loss": 5.4064, "step": 280 }, { "epoch": 2.4017094017094016, "grad_norm": 14.982254981994629, "learning_rate": 8.7991452991453e-06, "loss": 5.2545, "step": 281 }, { "epoch": 2.41025641025641, "grad_norm": 16.258047103881836, "learning_rate": 8.794871794871796e-06, "loss": 5.0141, "step": 282 }, { "epoch": 2.4188034188034186, "grad_norm": 22.5199031829834, "learning_rate": 8.790598290598291e-06, "loss": 5.3486, "step": 283 }, { "epoch": 2.427350427350427, "grad_norm": 17.546480178833008, "learning_rate": 8.786324786324786e-06, "loss": 5.2785, "step": 284 }, { "epoch": 2.435897435897436, "grad_norm": 22.07866668701172, "learning_rate": 8.782051282051283e-06, "loss": 5.4471, "step": 285 }, { "epoch": 2.4444444444444446, "grad_norm": 409.2532043457031, "learning_rate": 8.777777777777778e-06, "loss": 6.0948, "step": 286 }, { "epoch": 2.452991452991453, "grad_norm": 185.7334747314453, "learning_rate": 8.773504273504275e-06, "loss": 5.5538, "step": 287 }, { "epoch": 2.4615384615384617, "grad_norm": 30.8182430267334, "learning_rate": 8.76923076923077e-06, "loss": 4.9661, "step": 288 }, { "epoch": 2.47008547008547, "grad_norm": 18.584409713745117, "learning_rate": 8.764957264957265e-06, "loss": 5.0947, "step": 289 }, { "epoch": 2.4786324786324787, "grad_norm": 18.128522872924805, "learning_rate": 8.760683760683762e-06, "loss": 4.8816, "step": 290 }, { "epoch": 2.4871794871794872, "grad_norm": 18.800090789794922, "learning_rate": 8.756410256410257e-06, "loss": 5.0952, "step": 291 }, { "epoch": 2.4957264957264957, "grad_norm": 22.140430450439453, "learning_rate": 8.752136752136753e-06, "loss": 4.5408, "step": 292 }, { "epoch": 2.5042735042735043, "grad_norm": 19.867111206054688, "learning_rate": 8.747863247863248e-06, "loss": 4.7435, "step": 293 }, { "epoch": 2.5128205128205128, "grad_norm": 19.437868118286133, "learning_rate": 8.743589743589743e-06, "loss": 5.2643, "step": 294 }, { "epoch": 2.5213675213675213, "grad_norm": 18.256561279296875, "learning_rate": 8.73931623931624e-06, "loss": 5.2531, "step": 295 }, { "epoch": 2.52991452991453, "grad_norm": 18.65209197998047, "learning_rate": 8.735042735042735e-06, "loss": 4.8646, "step": 296 }, { "epoch": 2.5384615384615383, "grad_norm": 14.704927444458008, "learning_rate": 8.730769230769232e-06, "loss": 4.8343, "step": 297 }, { "epoch": 2.547008547008547, "grad_norm": 15.522851943969727, "learning_rate": 8.726495726495727e-06, "loss": 4.898, "step": 298 }, { "epoch": 2.5555555555555554, "grad_norm": 21.7825927734375, "learning_rate": 8.722222222222224e-06, "loss": 5.0732, "step": 299 }, { "epoch": 2.564102564102564, "grad_norm": 17.963552474975586, "learning_rate": 8.717948717948719e-06, "loss": 4.9684, "step": 300 }, { "epoch": 2.5726495726495724, "grad_norm": 16.14459991455078, "learning_rate": 8.713675213675214e-06, "loss": 4.8802, "step": 301 }, { "epoch": 2.5811965811965814, "grad_norm": 18.386646270751953, "learning_rate": 8.70940170940171e-06, "loss": 4.8837, "step": 302 }, { "epoch": 2.58974358974359, "grad_norm": 19.471376419067383, "learning_rate": 8.705128205128206e-06, "loss": 4.6325, "step": 303 }, { "epoch": 2.5982905982905984, "grad_norm": 17.839717864990234, "learning_rate": 8.700854700854702e-06, "loss": 4.7851, "step": 304 }, { "epoch": 2.606837606837607, "grad_norm": 26.519363403320312, "learning_rate": 8.696581196581197e-06, "loss": 5.0576, "step": 305 }, { "epoch": 2.6153846153846154, "grad_norm": 14.135244369506836, "learning_rate": 8.692307692307692e-06, "loss": 4.7719, "step": 306 }, { "epoch": 2.623931623931624, "grad_norm": 16.5241641998291, "learning_rate": 8.68803418803419e-06, "loss": 4.5826, "step": 307 }, { "epoch": 2.6324786324786325, "grad_norm": 23.982437133789062, "learning_rate": 8.683760683760684e-06, "loss": 4.4878, "step": 308 }, { "epoch": 2.641025641025641, "grad_norm": 16.036361694335938, "learning_rate": 8.679487179487181e-06, "loss": 4.3867, "step": 309 }, { "epoch": 2.6495726495726495, "grad_norm": 16.19298553466797, "learning_rate": 8.675213675213676e-06, "loss": 4.763, "step": 310 }, { "epoch": 2.658119658119658, "grad_norm": 19.32802963256836, "learning_rate": 8.670940170940171e-06, "loss": 4.4083, "step": 311 }, { "epoch": 2.6666666666666665, "grad_norm": 21.75898551940918, "learning_rate": 8.666666666666668e-06, "loss": 4.8782, "step": 312 }, { "epoch": 2.6752136752136755, "grad_norm": 905.6954956054688, "learning_rate": 8.662393162393163e-06, "loss": 5.7901, "step": 313 }, { "epoch": 2.683760683760684, "grad_norm": 21.126985549926758, "learning_rate": 8.65811965811966e-06, "loss": 4.918, "step": 314 }, { "epoch": 2.6923076923076925, "grad_norm": 22.190237045288086, "learning_rate": 8.653846153846155e-06, "loss": 4.4327, "step": 315 }, { "epoch": 2.700854700854701, "grad_norm": 90.69184875488281, "learning_rate": 8.649572649572651e-06, "loss": 5.1477, "step": 316 }, { "epoch": 2.7094017094017095, "grad_norm": 43.43864059448242, "learning_rate": 8.645299145299146e-06, "loss": 4.5476, "step": 317 }, { "epoch": 2.717948717948718, "grad_norm": 19.24538230895996, "learning_rate": 8.641025641025641e-06, "loss": 4.4304, "step": 318 }, { "epoch": 2.7264957264957266, "grad_norm": 21.809600830078125, "learning_rate": 8.636752136752138e-06, "loss": 4.4215, "step": 319 }, { "epoch": 2.735042735042735, "grad_norm": 21.406156539916992, "learning_rate": 8.632478632478633e-06, "loss": 4.5411, "step": 320 }, { "epoch": 2.7435897435897436, "grad_norm": 17.57236099243164, "learning_rate": 8.62820512820513e-06, "loss": 4.7952, "step": 321 }, { "epoch": 2.752136752136752, "grad_norm": 21.049169540405273, "learning_rate": 8.623931623931625e-06, "loss": 4.4596, "step": 322 }, { "epoch": 2.7606837606837606, "grad_norm": 20.04981803894043, "learning_rate": 8.61965811965812e-06, "loss": 4.4705, "step": 323 }, { "epoch": 2.769230769230769, "grad_norm": 21.146499633789062, "learning_rate": 8.615384615384617e-06, "loss": 4.6081, "step": 324 }, { "epoch": 2.7777777777777777, "grad_norm": 20.9805908203125, "learning_rate": 8.611111111111112e-06, "loss": 4.8387, "step": 325 }, { "epoch": 2.786324786324786, "grad_norm": 17.708343505859375, "learning_rate": 8.606837606837609e-06, "loss": 4.3455, "step": 326 }, { "epoch": 2.7948717948717947, "grad_norm": 25.657032012939453, "learning_rate": 8.602564102564104e-06, "loss": 4.3119, "step": 327 }, { "epoch": 2.8034188034188032, "grad_norm": 17.713972091674805, "learning_rate": 8.598290598290599e-06, "loss": 4.5597, "step": 328 }, { "epoch": 2.8119658119658117, "grad_norm": 22.297082901000977, "learning_rate": 8.594017094017095e-06, "loss": 3.8398, "step": 329 }, { "epoch": 2.8205128205128203, "grad_norm": 16.11454200744629, "learning_rate": 8.58974358974359e-06, "loss": 3.2049, "step": 330 }, { "epoch": 2.8290598290598292, "grad_norm": 27.323585510253906, "learning_rate": 8.585470085470086e-06, "loss": 4.0371, "step": 331 }, { "epoch": 2.8376068376068377, "grad_norm": 21.090797424316406, "learning_rate": 8.58119658119658e-06, "loss": 4.5193, "step": 332 }, { "epoch": 2.8461538461538463, "grad_norm": 39.087432861328125, "learning_rate": 8.576923076923077e-06, "loss": 4.3537, "step": 333 }, { "epoch": 2.8547008547008548, "grad_norm": 18.49846839904785, "learning_rate": 8.572649572649572e-06, "loss": 4.614, "step": 334 }, { "epoch": 2.8632478632478633, "grad_norm": 26.671632766723633, "learning_rate": 8.568376068376069e-06, "loss": 4.2224, "step": 335 }, { "epoch": 2.871794871794872, "grad_norm": 25.799545288085938, "learning_rate": 8.564102564102564e-06, "loss": 4.2209, "step": 336 }, { "epoch": 2.8803418803418803, "grad_norm": 20.131961822509766, "learning_rate": 8.559829059829061e-06, "loss": 4.5194, "step": 337 }, { "epoch": 2.888888888888889, "grad_norm": 20.193859100341797, "learning_rate": 8.555555555555556e-06, "loss": 3.9966, "step": 338 }, { "epoch": 2.8974358974358974, "grad_norm": 20.06737518310547, "learning_rate": 8.551282051282051e-06, "loss": 3.7394, "step": 339 }, { "epoch": 2.905982905982906, "grad_norm": 438.34429931640625, "learning_rate": 8.547008547008548e-06, "loss": 5.1558, "step": 340 }, { "epoch": 2.9145299145299144, "grad_norm": 22.152528762817383, "learning_rate": 8.542735042735043e-06, "loss": 3.9014, "step": 341 }, { "epoch": 2.9230769230769234, "grad_norm": 29.279739379882812, "learning_rate": 8.53846153846154e-06, "loss": 4.0479, "step": 342 }, { "epoch": 2.931623931623932, "grad_norm": 26.182645797729492, "learning_rate": 8.534188034188035e-06, "loss": 4.2022, "step": 343 }, { "epoch": 2.9401709401709404, "grad_norm": 22.329736709594727, "learning_rate": 8.52991452991453e-06, "loss": 3.8777, "step": 344 }, { "epoch": 2.948717948717949, "grad_norm": 20.62833023071289, "learning_rate": 8.525641025641026e-06, "loss": 4.2189, "step": 345 }, { "epoch": 2.9572649572649574, "grad_norm": 20.176612854003906, "learning_rate": 8.521367521367521e-06, "loss": 4.0124, "step": 346 }, { "epoch": 2.965811965811966, "grad_norm": 18.77017593383789, "learning_rate": 8.517094017094018e-06, "loss": 3.3286, "step": 347 }, { "epoch": 2.9743589743589745, "grad_norm": 226.93701171875, "learning_rate": 8.512820512820513e-06, "loss": 4.6969, "step": 348 }, { "epoch": 2.982905982905983, "grad_norm": 675.1133422851562, "learning_rate": 8.508547008547008e-06, "loss": 4.6717, "step": 349 }, { "epoch": 2.9914529914529915, "grad_norm": 19.938486099243164, "learning_rate": 8.504273504273505e-06, "loss": 4.0103, "step": 350 }, { "epoch": 3.0, "grad_norm": 15.917003631591797, "learning_rate": 8.5e-06, "loss": 3.1643, "step": 351 }, { "epoch": 3.0, "eval_loss": 3.4197537899017334, "eval_runtime": 9.289, "eval_samples_per_second": 50.167, "eval_steps_per_second": 6.352, "step": 351 }, { "epoch": 3.0085470085470085, "grad_norm": 22.22833251953125, "learning_rate": 8.495726495726497e-06, "loss": 4.3458, "step": 352 }, { "epoch": 3.017094017094017, "grad_norm": 16.4627685546875, "learning_rate": 8.491452991452992e-06, "loss": 3.5374, "step": 353 }, { "epoch": 3.0256410256410255, "grad_norm": 16.389379501342773, "learning_rate": 8.487179487179488e-06, "loss": 4.1384, "step": 354 }, { "epoch": 3.034188034188034, "grad_norm": 19.589706420898438, "learning_rate": 8.482905982905983e-06, "loss": 3.9522, "step": 355 }, { "epoch": 3.0427350427350426, "grad_norm": 21.66250228881836, "learning_rate": 8.478632478632479e-06, "loss": 4.0197, "step": 356 }, { "epoch": 3.051282051282051, "grad_norm": 42.1422119140625, "learning_rate": 8.474358974358975e-06, "loss": 3.9432, "step": 357 }, { "epoch": 3.0598290598290596, "grad_norm": 23.0153751373291, "learning_rate": 8.47008547008547e-06, "loss": 3.9146, "step": 358 }, { "epoch": 3.0683760683760686, "grad_norm": 20.847400665283203, "learning_rate": 8.465811965811967e-06, "loss": 3.9736, "step": 359 }, { "epoch": 3.076923076923077, "grad_norm": 23.553855895996094, "learning_rate": 8.461538461538462e-06, "loss": 3.646, "step": 360 }, { "epoch": 3.0854700854700856, "grad_norm": 18.651151657104492, "learning_rate": 8.457264957264957e-06, "loss": 3.761, "step": 361 }, { "epoch": 3.094017094017094, "grad_norm": 23.437379837036133, "learning_rate": 8.452991452991454e-06, "loss": 3.9258, "step": 362 }, { "epoch": 3.1025641025641026, "grad_norm": 19.025928497314453, "learning_rate": 8.448717948717949e-06, "loss": 3.4911, "step": 363 }, { "epoch": 3.111111111111111, "grad_norm": 25.955963134765625, "learning_rate": 8.444444444444446e-06, "loss": 3.7231, "step": 364 }, { "epoch": 3.1196581196581197, "grad_norm": 19.691673278808594, "learning_rate": 8.44017094017094e-06, "loss": 3.9225, "step": 365 }, { "epoch": 3.128205128205128, "grad_norm": 19.47168731689453, "learning_rate": 8.435897435897436e-06, "loss": 3.6261, "step": 366 }, { "epoch": 3.1367521367521367, "grad_norm": 20.50010108947754, "learning_rate": 8.431623931623932e-06, "loss": 3.3306, "step": 367 }, { "epoch": 3.1452991452991452, "grad_norm": 21.198938369750977, "learning_rate": 8.427350427350428e-06, "loss": 3.6388, "step": 368 }, { "epoch": 3.1538461538461537, "grad_norm": 16.93203353881836, "learning_rate": 8.423076923076924e-06, "loss": 3.9556, "step": 369 }, { "epoch": 3.1623931623931623, "grad_norm": 15.074128150939941, "learning_rate": 8.41880341880342e-06, "loss": 2.9899, "step": 370 }, { "epoch": 3.1709401709401708, "grad_norm": 23.041452407836914, "learning_rate": 8.414529914529916e-06, "loss": 3.291, "step": 371 }, { "epoch": 3.1794871794871793, "grad_norm": 24.146419525146484, "learning_rate": 8.410256410256411e-06, "loss": 4.0683, "step": 372 }, { "epoch": 3.1880341880341883, "grad_norm": 27.864879608154297, "learning_rate": 8.405982905982906e-06, "loss": 3.6171, "step": 373 }, { "epoch": 3.1965811965811968, "grad_norm": 33.83136749267578, "learning_rate": 8.401709401709403e-06, "loss": 3.7324, "step": 374 }, { "epoch": 3.2051282051282053, "grad_norm": 21.020702362060547, "learning_rate": 8.397435897435898e-06, "loss": 3.5688, "step": 375 }, { "epoch": 3.213675213675214, "grad_norm": 23.521453857421875, "learning_rate": 8.393162393162395e-06, "loss": 3.6917, "step": 376 }, { "epoch": 3.2222222222222223, "grad_norm": 35.85578536987305, "learning_rate": 8.38888888888889e-06, "loss": 3.6532, "step": 377 }, { "epoch": 3.230769230769231, "grad_norm": 26.080968856811523, "learning_rate": 8.384615384615385e-06, "loss": 3.8828, "step": 378 }, { "epoch": 3.2393162393162394, "grad_norm": 20.829381942749023, "learning_rate": 8.380341880341881e-06, "loss": 3.8374, "step": 379 }, { "epoch": 3.247863247863248, "grad_norm": 20.85077476501465, "learning_rate": 8.376068376068377e-06, "loss": 3.2896, "step": 380 }, { "epoch": 3.2564102564102564, "grad_norm": 19.036088943481445, "learning_rate": 8.371794871794873e-06, "loss": 3.4996, "step": 381 }, { "epoch": 3.264957264957265, "grad_norm": 23.725513458251953, "learning_rate": 8.367521367521368e-06, "loss": 3.7686, "step": 382 }, { "epoch": 3.2735042735042734, "grad_norm": 22.553386688232422, "learning_rate": 8.363247863247865e-06, "loss": 3.8476, "step": 383 }, { "epoch": 3.282051282051282, "grad_norm": 20.263992309570312, "learning_rate": 8.35897435897436e-06, "loss": 3.3278, "step": 384 }, { "epoch": 3.2905982905982905, "grad_norm": 22.47858238220215, "learning_rate": 8.354700854700855e-06, "loss": 3.5437, "step": 385 }, { "epoch": 3.299145299145299, "grad_norm": 24.14532470703125, "learning_rate": 8.350427350427352e-06, "loss": 3.696, "step": 386 }, { "epoch": 3.3076923076923075, "grad_norm": 31.457847595214844, "learning_rate": 8.346153846153847e-06, "loss": 4.3065, "step": 387 }, { "epoch": 3.316239316239316, "grad_norm": 24.503095626831055, "learning_rate": 8.341880341880344e-06, "loss": 3.4798, "step": 388 }, { "epoch": 3.324786324786325, "grad_norm": 19.798818588256836, "learning_rate": 8.337606837606839e-06, "loss": 3.5323, "step": 389 }, { "epoch": 3.3333333333333335, "grad_norm": 22.023189544677734, "learning_rate": 8.333333333333334e-06, "loss": 3.4088, "step": 390 }, { "epoch": 3.341880341880342, "grad_norm": 17.314960479736328, "learning_rate": 8.32905982905983e-06, "loss": 3.2462, "step": 391 }, { "epoch": 3.3504273504273505, "grad_norm": 22.714536666870117, "learning_rate": 8.324786324786326e-06, "loss": 3.7863, "step": 392 }, { "epoch": 3.358974358974359, "grad_norm": 27.710514068603516, "learning_rate": 8.320512820512822e-06, "loss": 3.6032, "step": 393 }, { "epoch": 3.3675213675213675, "grad_norm": 23.35419464111328, "learning_rate": 8.316239316239317e-06, "loss": 3.5599, "step": 394 }, { "epoch": 3.376068376068376, "grad_norm": 24.0956974029541, "learning_rate": 8.311965811965812e-06, "loss": 3.5186, "step": 395 }, { "epoch": 3.3846153846153846, "grad_norm": 22.09107780456543, "learning_rate": 8.307692307692309e-06, "loss": 3.4843, "step": 396 }, { "epoch": 3.393162393162393, "grad_norm": 23.956623077392578, "learning_rate": 8.303418803418804e-06, "loss": 3.1625, "step": 397 }, { "epoch": 3.4017094017094016, "grad_norm": 18.875917434692383, "learning_rate": 8.299145299145301e-06, "loss": 3.3494, "step": 398 }, { "epoch": 3.41025641025641, "grad_norm": 33.475467681884766, "learning_rate": 8.294871794871796e-06, "loss": 3.9247, "step": 399 }, { "epoch": 3.4188034188034186, "grad_norm": 16.28295135498047, "learning_rate": 8.290598290598293e-06, "loss": 3.7446, "step": 400 }, { "epoch": 3.427350427350427, "grad_norm": 24.205049514770508, "learning_rate": 8.286324786324788e-06, "loss": 3.343, "step": 401 }, { "epoch": 3.435897435897436, "grad_norm": 21.21460723876953, "learning_rate": 8.282051282051283e-06, "loss": 3.2437, "step": 402 }, { "epoch": 3.4444444444444446, "grad_norm": 36.8713264465332, "learning_rate": 8.277777777777778e-06, "loss": 3.5009, "step": 403 }, { "epoch": 3.452991452991453, "grad_norm": 26.85513687133789, "learning_rate": 8.273504273504273e-06, "loss": 3.7271, "step": 404 }, { "epoch": 3.4615384615384617, "grad_norm": 18.184600830078125, "learning_rate": 8.26923076923077e-06, "loss": 3.2216, "step": 405 }, { "epoch": 3.47008547008547, "grad_norm": 27.03692054748535, "learning_rate": 8.264957264957265e-06, "loss": 3.516, "step": 406 }, { "epoch": 3.4786324786324787, "grad_norm": 20.63736915588379, "learning_rate": 8.260683760683761e-06, "loss": 3.1349, "step": 407 }, { "epoch": 3.4871794871794872, "grad_norm": 22.467845916748047, "learning_rate": 8.256410256410256e-06, "loss": 3.3878, "step": 408 }, { "epoch": 3.4957264957264957, "grad_norm": 21.25887107849121, "learning_rate": 8.252136752136753e-06, "loss": 3.8298, "step": 409 }, { "epoch": 3.5042735042735043, "grad_norm": 47.3256721496582, "learning_rate": 8.247863247863248e-06, "loss": 3.5321, "step": 410 }, { "epoch": 3.5128205128205128, "grad_norm": 22.103790283203125, "learning_rate": 8.243589743589743e-06, "loss": 3.335, "step": 411 }, { "epoch": 3.5213675213675213, "grad_norm": 25.779077529907227, "learning_rate": 8.23931623931624e-06, "loss": 3.5047, "step": 412 }, { "epoch": 3.52991452991453, "grad_norm": 22.78207778930664, "learning_rate": 8.235042735042735e-06, "loss": 3.3827, "step": 413 }, { "epoch": 3.5384615384615383, "grad_norm": 22.41836166381836, "learning_rate": 8.230769230769232e-06, "loss": 3.4521, "step": 414 }, { "epoch": 3.547008547008547, "grad_norm": 60.29216384887695, "learning_rate": 8.226495726495727e-06, "loss": 3.4598, "step": 415 }, { "epoch": 3.5555555555555554, "grad_norm": 25.27474021911621, "learning_rate": 8.222222222222222e-06, "loss": 3.7443, "step": 416 }, { "epoch": 3.564102564102564, "grad_norm": 25.297466278076172, "learning_rate": 8.217948717948719e-06, "loss": 3.3123, "step": 417 }, { "epoch": 3.5726495726495724, "grad_norm": 28.5858154296875, "learning_rate": 8.213675213675214e-06, "loss": 3.1801, "step": 418 }, { "epoch": 3.5811965811965814, "grad_norm": 20.05567741394043, "learning_rate": 8.20940170940171e-06, "loss": 3.7242, "step": 419 }, { "epoch": 3.58974358974359, "grad_norm": 32.33693313598633, "learning_rate": 8.205128205128205e-06, "loss": 3.3587, "step": 420 }, { "epoch": 3.5982905982905984, "grad_norm": 36.1716194152832, "learning_rate": 8.200854700854702e-06, "loss": 3.1573, "step": 421 }, { "epoch": 3.606837606837607, "grad_norm": 33.39027404785156, "learning_rate": 8.196581196581197e-06, "loss": 3.098, "step": 422 }, { "epoch": 3.6153846153846154, "grad_norm": 28.4794864654541, "learning_rate": 8.192307692307692e-06, "loss": 3.6403, "step": 423 }, { "epoch": 3.623931623931624, "grad_norm": 29.702611923217773, "learning_rate": 8.188034188034189e-06, "loss": 3.2569, "step": 424 }, { "epoch": 3.6324786324786325, "grad_norm": 24.73663902282715, "learning_rate": 8.183760683760684e-06, "loss": 3.0508, "step": 425 }, { "epoch": 3.641025641025641, "grad_norm": 29.606807708740234, "learning_rate": 8.17948717948718e-06, "loss": 3.2524, "step": 426 }, { "epoch": 3.6495726495726495, "grad_norm": 22.721933364868164, "learning_rate": 8.175213675213676e-06, "loss": 3.2583, "step": 427 }, { "epoch": 3.658119658119658, "grad_norm": 25.009403228759766, "learning_rate": 8.17094017094017e-06, "loss": 3.0678, "step": 428 }, { "epoch": 3.6666666666666665, "grad_norm": 25.776636123657227, "learning_rate": 8.166666666666668e-06, "loss": 3.1676, "step": 429 }, { "epoch": 3.6752136752136755, "grad_norm": 28.210241317749023, "learning_rate": 8.162393162393163e-06, "loss": 3.2869, "step": 430 }, { "epoch": 3.683760683760684, "grad_norm": 26.29328155517578, "learning_rate": 8.15811965811966e-06, "loss": 3.3618, "step": 431 }, { "epoch": 3.6923076923076925, "grad_norm": 19.813465118408203, "learning_rate": 8.153846153846154e-06, "loss": 3.0655, "step": 432 }, { "epoch": 3.700854700854701, "grad_norm": 29.718812942504883, "learning_rate": 8.14957264957265e-06, "loss": 3.1538, "step": 433 }, { "epoch": 3.7094017094017095, "grad_norm": 30.629135131835938, "learning_rate": 8.145299145299146e-06, "loss": 3.3252, "step": 434 }, { "epoch": 3.717948717948718, "grad_norm": 27.716825485229492, "learning_rate": 8.141025641025641e-06, "loss": 3.4083, "step": 435 }, { "epoch": 3.7264957264957266, "grad_norm": 39.23820877075195, "learning_rate": 8.136752136752138e-06, "loss": 3.3074, "step": 436 }, { "epoch": 3.735042735042735, "grad_norm": 34.516422271728516, "learning_rate": 8.132478632478633e-06, "loss": 3.3529, "step": 437 }, { "epoch": 3.7435897435897436, "grad_norm": 41.98606872558594, "learning_rate": 8.12820512820513e-06, "loss": 3.248, "step": 438 }, { "epoch": 3.752136752136752, "grad_norm": 27.99711799621582, "learning_rate": 8.123931623931625e-06, "loss": 3.3054, "step": 439 }, { "epoch": 3.7606837606837606, "grad_norm": 25.21969985961914, "learning_rate": 8.11965811965812e-06, "loss": 2.8518, "step": 440 }, { "epoch": 3.769230769230769, "grad_norm": 29.14298439025879, "learning_rate": 8.115384615384617e-06, "loss": 3.0063, "step": 441 }, { "epoch": 3.7777777777777777, "grad_norm": 27.040063858032227, "learning_rate": 8.111111111111112e-06, "loss": 3.3066, "step": 442 }, { "epoch": 3.786324786324786, "grad_norm": 365.3290100097656, "learning_rate": 8.106837606837608e-06, "loss": 3.8057, "step": 443 }, { "epoch": 3.7948717948717947, "grad_norm": 32.89745330810547, "learning_rate": 8.102564102564103e-06, "loss": 3.0903, "step": 444 }, { "epoch": 3.8034188034188032, "grad_norm": 29.448022842407227, "learning_rate": 8.098290598290598e-06, "loss": 3.2723, "step": 445 }, { "epoch": 3.8119658119658117, "grad_norm": 27.838123321533203, "learning_rate": 8.094017094017095e-06, "loss": 3.2903, "step": 446 }, { "epoch": 3.8205128205128203, "grad_norm": 29.047847747802734, "learning_rate": 8.08974358974359e-06, "loss": 2.9048, "step": 447 }, { "epoch": 3.8290598290598292, "grad_norm": 28.666589736938477, "learning_rate": 8.085470085470087e-06, "loss": 3.2186, "step": 448 }, { "epoch": 3.8376068376068377, "grad_norm": 31.796804428100586, "learning_rate": 8.081196581196582e-06, "loss": 3.2668, "step": 449 }, { "epoch": 3.8461538461538463, "grad_norm": 22.665220260620117, "learning_rate": 8.076923076923077e-06, "loss": 3.0965, "step": 450 }, { "epoch": 3.8547008547008548, "grad_norm": 32.7353630065918, "learning_rate": 8.072649572649574e-06, "loss": 3.1759, "step": 451 }, { "epoch": 3.8632478632478633, "grad_norm": 32.95683670043945, "learning_rate": 8.068376068376069e-06, "loss": 2.9589, "step": 452 }, { "epoch": 3.871794871794872, "grad_norm": 30.04659652709961, "learning_rate": 8.064102564102566e-06, "loss": 3.4709, "step": 453 }, { "epoch": 3.8803418803418803, "grad_norm": 30.41158676147461, "learning_rate": 8.05982905982906e-06, "loss": 2.9385, "step": 454 }, { "epoch": 3.888888888888889, "grad_norm": 30.059635162353516, "learning_rate": 8.055555555555557e-06, "loss": 3.0099, "step": 455 }, { "epoch": 3.8974358974358974, "grad_norm": 24.83198356628418, "learning_rate": 8.051282051282052e-06, "loss": 2.9783, "step": 456 }, { "epoch": 3.905982905982906, "grad_norm": 25.38758087158203, "learning_rate": 8.047008547008547e-06, "loss": 3.0275, "step": 457 }, { "epoch": 3.9145299145299144, "grad_norm": 25.21868133544922, "learning_rate": 8.042735042735044e-06, "loss": 2.9096, "step": 458 }, { "epoch": 3.9230769230769234, "grad_norm": 32.02922058105469, "learning_rate": 8.03846153846154e-06, "loss": 3.059, "step": 459 }, { "epoch": 3.931623931623932, "grad_norm": 22.240680694580078, "learning_rate": 8.034188034188036e-06, "loss": 2.9473, "step": 460 }, { "epoch": 3.9401709401709404, "grad_norm": 27.61838150024414, "learning_rate": 8.029914529914531e-06, "loss": 2.4506, "step": 461 }, { "epoch": 3.948717948717949, "grad_norm": 27.742216110229492, "learning_rate": 8.025641025641026e-06, "loss": 2.9082, "step": 462 }, { "epoch": 3.9572649572649574, "grad_norm": 29.965059280395508, "learning_rate": 8.021367521367523e-06, "loss": 2.8268, "step": 463 }, { "epoch": 3.965811965811966, "grad_norm": 31.429990768432617, "learning_rate": 8.017094017094018e-06, "loss": 3.1805, "step": 464 }, { "epoch": 3.9743589743589745, "grad_norm": 31.162532806396484, "learning_rate": 8.012820512820515e-06, "loss": 2.64, "step": 465 }, { "epoch": 3.982905982905983, "grad_norm": 28.240577697753906, "learning_rate": 8.00854700854701e-06, "loss": 3.249, "step": 466 }, { "epoch": 3.9914529914529915, "grad_norm": 48.52914810180664, "learning_rate": 8.004273504273505e-06, "loss": 3.1619, "step": 467 }, { "epoch": 4.0, "grad_norm": 36.80685806274414, "learning_rate": 8.000000000000001e-06, "loss": 3.5337, "step": 468 }, { "epoch": 4.0, "eval_loss": 2.1340389251708984, "eval_runtime": 9.2211, "eval_samples_per_second": 50.536, "eval_steps_per_second": 6.398, "step": 468 }, { "epoch": 4.0085470085470085, "grad_norm": 45.45211410522461, "learning_rate": 7.995726495726496e-06, "loss": 3.5596, "step": 469 }, { "epoch": 4.017094017094017, "grad_norm": 32.711669921875, "learning_rate": 7.991452991452993e-06, "loss": 2.9362, "step": 470 }, { "epoch": 4.0256410256410255, "grad_norm": 26.151872634887695, "learning_rate": 7.987179487179488e-06, "loss": 2.6796, "step": 471 }, { "epoch": 4.034188034188034, "grad_norm": 33.02329635620117, "learning_rate": 7.982905982905985e-06, "loss": 2.7147, "step": 472 }, { "epoch": 4.042735042735043, "grad_norm": 31.1684513092041, "learning_rate": 7.97863247863248e-06, "loss": 3.2356, "step": 473 }, { "epoch": 4.051282051282051, "grad_norm": 37.0435905456543, "learning_rate": 7.974358974358975e-06, "loss": 2.9954, "step": 474 }, { "epoch": 4.05982905982906, "grad_norm": 25.989973068237305, "learning_rate": 7.970085470085472e-06, "loss": 3.2143, "step": 475 }, { "epoch": 4.068376068376068, "grad_norm": 27.048690795898438, "learning_rate": 7.965811965811967e-06, "loss": 2.5087, "step": 476 }, { "epoch": 4.076923076923077, "grad_norm": 26.857696533203125, "learning_rate": 7.961538461538462e-06, "loss": 2.6466, "step": 477 }, { "epoch": 4.085470085470085, "grad_norm": 33.342193603515625, "learning_rate": 7.957264957264957e-06, "loss": 2.6591, "step": 478 }, { "epoch": 4.094017094017094, "grad_norm": 64.21253967285156, "learning_rate": 7.952991452991454e-06, "loss": 3.0295, "step": 479 }, { "epoch": 4.102564102564102, "grad_norm": 31.240161895751953, "learning_rate": 7.948717948717949e-06, "loss": 2.9374, "step": 480 }, { "epoch": 4.111111111111111, "grad_norm": 29.338851928710938, "learning_rate": 7.944444444444445e-06, "loss": 2.5019, "step": 481 }, { "epoch": 4.119658119658119, "grad_norm": 36.79518127441406, "learning_rate": 7.94017094017094e-06, "loss": 2.7649, "step": 482 }, { "epoch": 4.128205128205128, "grad_norm": 37.036739349365234, "learning_rate": 7.935897435897435e-06, "loss": 2.5182, "step": 483 }, { "epoch": 4.136752136752137, "grad_norm": 42.571163177490234, "learning_rate": 7.931623931623932e-06, "loss": 2.767, "step": 484 }, { "epoch": 4.145299145299146, "grad_norm": 33.72893524169922, "learning_rate": 7.927350427350427e-06, "loss": 3.1404, "step": 485 }, { "epoch": 4.153846153846154, "grad_norm": 27.06032943725586, "learning_rate": 7.923076923076924e-06, "loss": 2.6825, "step": 486 }, { "epoch": 4.162393162393163, "grad_norm": 31.8147029876709, "learning_rate": 7.918803418803419e-06, "loss": 2.5129, "step": 487 }, { "epoch": 4.170940170940171, "grad_norm": 35.681793212890625, "learning_rate": 7.914529914529914e-06, "loss": 2.4793, "step": 488 }, { "epoch": 4.17948717948718, "grad_norm": 159.4467315673828, "learning_rate": 7.91025641025641e-06, "loss": 3.5531, "step": 489 }, { "epoch": 4.188034188034188, "grad_norm": 40.12252426147461, "learning_rate": 7.905982905982906e-06, "loss": 2.7095, "step": 490 }, { "epoch": 4.196581196581197, "grad_norm": 27.05786895751953, "learning_rate": 7.901709401709403e-06, "loss": 2.5984, "step": 491 }, { "epoch": 4.205128205128205, "grad_norm": 24.31035614013672, "learning_rate": 7.897435897435898e-06, "loss": 2.89, "step": 492 }, { "epoch": 4.213675213675214, "grad_norm": 277.16156005859375, "learning_rate": 7.893162393162394e-06, "loss": 3.8076, "step": 493 }, { "epoch": 4.222222222222222, "grad_norm": 29.722867965698242, "learning_rate": 7.88888888888889e-06, "loss": 2.4189, "step": 494 }, { "epoch": 4.230769230769231, "grad_norm": 40.47605514526367, "learning_rate": 7.884615384615384e-06, "loss": 2.6225, "step": 495 }, { "epoch": 4.239316239316239, "grad_norm": 29.136499404907227, "learning_rate": 7.880341880341881e-06, "loss": 2.5223, "step": 496 }, { "epoch": 4.247863247863248, "grad_norm": 78.86258697509766, "learning_rate": 7.876068376068376e-06, "loss": 2.6587, "step": 497 }, { "epoch": 4.256410256410256, "grad_norm": 24.473243713378906, "learning_rate": 7.871794871794873e-06, "loss": 2.456, "step": 498 }, { "epoch": 4.264957264957265, "grad_norm": 80.45248413085938, "learning_rate": 7.867521367521368e-06, "loss": 3.1893, "step": 499 }, { "epoch": 4.273504273504273, "grad_norm": 194.2708282470703, "learning_rate": 7.863247863247863e-06, "loss": 3.8294, "step": 500 }, { "epoch": 4.282051282051282, "grad_norm": 27.74302101135254, "learning_rate": 7.85897435897436e-06, "loss": 2.2506, "step": 501 }, { "epoch": 4.2905982905982905, "grad_norm": 21.90385627746582, "learning_rate": 7.854700854700855e-06, "loss": 3.0985, "step": 502 }, { "epoch": 4.299145299145299, "grad_norm": 50.30342102050781, "learning_rate": 7.850427350427352e-06, "loss": 2.526, "step": 503 }, { "epoch": 4.3076923076923075, "grad_norm": 28.666881561279297, "learning_rate": 7.846153846153847e-06, "loss": 2.4213, "step": 504 }, { "epoch": 4.316239316239316, "grad_norm": 27.927257537841797, "learning_rate": 7.841880341880342e-06, "loss": 2.6731, "step": 505 }, { "epoch": 4.3247863247863245, "grad_norm": 36.12032699584961, "learning_rate": 7.837606837606838e-06, "loss": 2.3323, "step": 506 }, { "epoch": 4.333333333333333, "grad_norm": 31.632287979125977, "learning_rate": 7.833333333333333e-06, "loss": 2.2966, "step": 507 }, { "epoch": 4.3418803418803416, "grad_norm": 26.511537551879883, "learning_rate": 7.82905982905983e-06, "loss": 2.3422, "step": 508 }, { "epoch": 4.35042735042735, "grad_norm": 31.429107666015625, "learning_rate": 7.824786324786325e-06, "loss": 2.6764, "step": 509 }, { "epoch": 4.358974358974359, "grad_norm": 29.8817138671875, "learning_rate": 7.820512820512822e-06, "loss": 2.4358, "step": 510 }, { "epoch": 4.367521367521368, "grad_norm": 29.293964385986328, "learning_rate": 7.816239316239317e-06, "loss": 2.504, "step": 511 }, { "epoch": 4.3760683760683765, "grad_norm": 23.624290466308594, "learning_rate": 7.811965811965812e-06, "loss": 2.0312, "step": 512 }, { "epoch": 4.384615384615385, "grad_norm": 25.336505889892578, "learning_rate": 7.807692307692309e-06, "loss": 2.1045, "step": 513 }, { "epoch": 4.3931623931623935, "grad_norm": 24.755443572998047, "learning_rate": 7.803418803418804e-06, "loss": 2.5754, "step": 514 }, { "epoch": 4.401709401709402, "grad_norm": 29.29696273803711, "learning_rate": 7.7991452991453e-06, "loss": 2.562, "step": 515 }, { "epoch": 4.410256410256411, "grad_norm": 28.054868698120117, "learning_rate": 7.794871794871796e-06, "loss": 1.9815, "step": 516 }, { "epoch": 4.418803418803419, "grad_norm": 20.894853591918945, "learning_rate": 7.79059829059829e-06, "loss": 2.5668, "step": 517 }, { "epoch": 4.427350427350428, "grad_norm": 19.532094955444336, "learning_rate": 7.786324786324787e-06, "loss": 2.2314, "step": 518 }, { "epoch": 4.435897435897436, "grad_norm": 27.919715881347656, "learning_rate": 7.782051282051282e-06, "loss": 1.9523, "step": 519 }, { "epoch": 4.444444444444445, "grad_norm": 21.91543960571289, "learning_rate": 7.77777777777778e-06, "loss": 2.559, "step": 520 }, { "epoch": 4.452991452991453, "grad_norm": 26.20106315612793, "learning_rate": 7.773504273504274e-06, "loss": 2.367, "step": 521 }, { "epoch": 4.461538461538462, "grad_norm": 23.455419540405273, "learning_rate": 7.76923076923077e-06, "loss": 2.4132, "step": 522 }, { "epoch": 4.47008547008547, "grad_norm": 49.62391662597656, "learning_rate": 7.764957264957266e-06, "loss": 1.8896, "step": 523 }, { "epoch": 4.478632478632479, "grad_norm": 25.721101760864258, "learning_rate": 7.760683760683761e-06, "loss": 1.9918, "step": 524 }, { "epoch": 4.487179487179487, "grad_norm": 22.906694412231445, "learning_rate": 7.756410256410258e-06, "loss": 2.1819, "step": 525 }, { "epoch": 4.495726495726496, "grad_norm": 28.5809268951416, "learning_rate": 7.752136752136753e-06, "loss": 2.0516, "step": 526 }, { "epoch": 4.504273504273504, "grad_norm": 26.47665023803711, "learning_rate": 7.74786324786325e-06, "loss": 2.0081, "step": 527 }, { "epoch": 4.512820512820513, "grad_norm": 27.221372604370117, "learning_rate": 7.743589743589745e-06, "loss": 2.0414, "step": 528 }, { "epoch": 4.521367521367521, "grad_norm": 27.931568145751953, "learning_rate": 7.73931623931624e-06, "loss": 2.0335, "step": 529 }, { "epoch": 4.52991452991453, "grad_norm": 25.567049026489258, "learning_rate": 7.735042735042736e-06, "loss": 2.0129, "step": 530 }, { "epoch": 4.538461538461538, "grad_norm": 30.897083282470703, "learning_rate": 7.730769230769231e-06, "loss": 2.3941, "step": 531 }, { "epoch": 4.547008547008547, "grad_norm": 21.92133903503418, "learning_rate": 7.726495726495728e-06, "loss": 2.2563, "step": 532 }, { "epoch": 4.555555555555555, "grad_norm": 27.053892135620117, "learning_rate": 7.722222222222223e-06, "loss": 2.2463, "step": 533 }, { "epoch": 4.564102564102564, "grad_norm": 29.3230037689209, "learning_rate": 7.717948717948718e-06, "loss": 1.9167, "step": 534 }, { "epoch": 4.572649572649572, "grad_norm": 36.06028747558594, "learning_rate": 7.713675213675215e-06, "loss": 1.9106, "step": 535 }, { "epoch": 4.581196581196581, "grad_norm": 24.622135162353516, "learning_rate": 7.70940170940171e-06, "loss": 2.2899, "step": 536 }, { "epoch": 4.589743589743589, "grad_norm": 21.3137264251709, "learning_rate": 7.705128205128207e-06, "loss": 2.0166, "step": 537 }, { "epoch": 4.598290598290598, "grad_norm": 21.939279556274414, "learning_rate": 7.700854700854702e-06, "loss": 2.3319, "step": 538 }, { "epoch": 4.6068376068376065, "grad_norm": 25.496994018554688, "learning_rate": 7.696581196581197e-06, "loss": 2.6162, "step": 539 }, { "epoch": 4.615384615384615, "grad_norm": 24.095666885375977, "learning_rate": 7.692307692307694e-06, "loss": 2.2863, "step": 540 }, { "epoch": 4.6239316239316235, "grad_norm": 31.96511459350586, "learning_rate": 7.688034188034189e-06, "loss": 2.0261, "step": 541 }, { "epoch": 4.632478632478632, "grad_norm": 22.66115379333496, "learning_rate": 7.683760683760685e-06, "loss": 2.2786, "step": 542 }, { "epoch": 4.641025641025641, "grad_norm": 23.661611557006836, "learning_rate": 7.67948717948718e-06, "loss": 1.7113, "step": 543 }, { "epoch": 4.64957264957265, "grad_norm": 18.64708709716797, "learning_rate": 7.675213675213677e-06, "loss": 2.1389, "step": 544 }, { "epoch": 4.6581196581196584, "grad_norm": 20.55480194091797, "learning_rate": 7.670940170940172e-06, "loss": 2.0831, "step": 545 }, { "epoch": 4.666666666666667, "grad_norm": 27.876964569091797, "learning_rate": 7.666666666666667e-06, "loss": 2.0358, "step": 546 }, { "epoch": 4.6752136752136755, "grad_norm": 20.236507415771484, "learning_rate": 7.662393162393164e-06, "loss": 1.5596, "step": 547 }, { "epoch": 4.683760683760684, "grad_norm": 23.360782623291016, "learning_rate": 7.658119658119659e-06, "loss": 1.9623, "step": 548 }, { "epoch": 4.6923076923076925, "grad_norm": 41.7568359375, "learning_rate": 7.653846153846154e-06, "loss": 1.9884, "step": 549 }, { "epoch": 4.700854700854701, "grad_norm": 28.651065826416016, "learning_rate": 7.649572649572649e-06, "loss": 2.1491, "step": 550 }, { "epoch": 4.7094017094017095, "grad_norm": 23.636432647705078, "learning_rate": 7.645299145299146e-06, "loss": 1.9352, "step": 551 }, { "epoch": 4.717948717948718, "grad_norm": 25.313966751098633, "learning_rate": 7.641025641025641e-06, "loss": 2.4112, "step": 552 }, { "epoch": 4.726495726495727, "grad_norm": 32.4974479675293, "learning_rate": 7.636752136752138e-06, "loss": 1.7017, "step": 553 }, { "epoch": 4.735042735042735, "grad_norm": 20.644481658935547, "learning_rate": 7.632478632478633e-06, "loss": 1.6904, "step": 554 }, { "epoch": 4.743589743589744, "grad_norm": 26.526721954345703, "learning_rate": 7.6282051282051286e-06, "loss": 2.1666, "step": 555 }, { "epoch": 4.752136752136752, "grad_norm": 23.375839233398438, "learning_rate": 7.6239316239316244e-06, "loss": 1.5555, "step": 556 }, { "epoch": 4.760683760683761, "grad_norm": 29.890501022338867, "learning_rate": 7.6196581196581195e-06, "loss": 2.0195, "step": 557 }, { "epoch": 4.769230769230769, "grad_norm": 687.5745239257812, "learning_rate": 7.615384615384615e-06, "loss": 2.4286, "step": 558 }, { "epoch": 4.777777777777778, "grad_norm": 22.844587326049805, "learning_rate": 7.611111111111111e-06, "loss": 2.2335, "step": 559 }, { "epoch": 4.786324786324786, "grad_norm": 29.633562088012695, "learning_rate": 7.606837606837607e-06, "loss": 1.7579, "step": 560 }, { "epoch": 4.794871794871795, "grad_norm": 48.04582977294922, "learning_rate": 7.602564102564103e-06, "loss": 2.3846, "step": 561 }, { "epoch": 4.803418803418803, "grad_norm": 27.2290096282959, "learning_rate": 7.598290598290599e-06, "loss": 2.2234, "step": 562 }, { "epoch": 4.811965811965812, "grad_norm": 29.782209396362305, "learning_rate": 7.594017094017094e-06, "loss": 2.0365, "step": 563 }, { "epoch": 4.82051282051282, "grad_norm": 32.457061767578125, "learning_rate": 7.58974358974359e-06, "loss": 2.0451, "step": 564 }, { "epoch": 4.829059829059829, "grad_norm": 22.089427947998047, "learning_rate": 7.585470085470086e-06, "loss": 1.7105, "step": 565 }, { "epoch": 4.837606837606837, "grad_norm": 23.105140686035156, "learning_rate": 7.581196581196582e-06, "loss": 1.6817, "step": 566 }, { "epoch": 4.846153846153846, "grad_norm": 24.513713836669922, "learning_rate": 7.5769230769230775e-06, "loss": 1.9553, "step": 567 }, { "epoch": 4.854700854700854, "grad_norm": 22.187759399414062, "learning_rate": 7.572649572649573e-06, "loss": 2.0309, "step": 568 }, { "epoch": 4.863247863247864, "grad_norm": 53.56728744506836, "learning_rate": 7.5683760683760685e-06, "loss": 2.6508, "step": 569 }, { "epoch": 4.871794871794872, "grad_norm": 27.983978271484375, "learning_rate": 7.564102564102564e-06, "loss": 2.1942, "step": 570 }, { "epoch": 4.880341880341881, "grad_norm": 25.610252380371094, "learning_rate": 7.55982905982906e-06, "loss": 1.4151, "step": 571 }, { "epoch": 4.888888888888889, "grad_norm": 19.856618881225586, "learning_rate": 7.555555555555556e-06, "loss": 1.6968, "step": 572 }, { "epoch": 4.897435897435898, "grad_norm": 20.288606643676758, "learning_rate": 7.551282051282052e-06, "loss": 1.7494, "step": 573 }, { "epoch": 4.905982905982906, "grad_norm": 23.206768035888672, "learning_rate": 7.547008547008547e-06, "loss": 2.1255, "step": 574 }, { "epoch": 4.914529914529915, "grad_norm": 21.275257110595703, "learning_rate": 7.542735042735043e-06, "loss": 1.7442, "step": 575 }, { "epoch": 4.923076923076923, "grad_norm": 22.635417938232422, "learning_rate": 7.538461538461539e-06, "loss": 1.9129, "step": 576 }, { "epoch": 4.931623931623932, "grad_norm": 21.440109252929688, "learning_rate": 7.534188034188035e-06, "loss": 2.0056, "step": 577 }, { "epoch": 4.94017094017094, "grad_norm": 20.939407348632812, "learning_rate": 7.529914529914531e-06, "loss": 1.7231, "step": 578 }, { "epoch": 4.948717948717949, "grad_norm": 16.189861297607422, "learning_rate": 7.5256410256410265e-06, "loss": 1.4255, "step": 579 }, { "epoch": 4.957264957264957, "grad_norm": 23.6302547454834, "learning_rate": 7.521367521367522e-06, "loss": 1.6748, "step": 580 }, { "epoch": 4.965811965811966, "grad_norm": 22.29713249206543, "learning_rate": 7.5170940170940175e-06, "loss": 1.5285, "step": 581 }, { "epoch": 4.9743589743589745, "grad_norm": 22.831275939941406, "learning_rate": 7.512820512820513e-06, "loss": 1.7742, "step": 582 }, { "epoch": 4.982905982905983, "grad_norm": 630.5899658203125, "learning_rate": 7.508547008547009e-06, "loss": 2.8598, "step": 583 }, { "epoch": 4.9914529914529915, "grad_norm": 22.880647659301758, "learning_rate": 7.504273504273505e-06, "loss": 1.6231, "step": 584 }, { "epoch": 5.0, "grad_norm": 21.379072189331055, "learning_rate": 7.500000000000001e-06, "loss": 1.3506, "step": 585 }, { "epoch": 5.0, "eval_loss": 0.8325614333152771, "eval_runtime": 9.2303, "eval_samples_per_second": 50.486, "eval_steps_per_second": 6.392, "step": 585 }, { "epoch": 5.0085470085470085, "grad_norm": 23.968698501586914, "learning_rate": 7.495726495726496e-06, "loss": 1.4263, "step": 586 }, { "epoch": 5.017094017094017, "grad_norm": 24.880769729614258, "learning_rate": 7.491452991452992e-06, "loss": 1.4994, "step": 587 }, { "epoch": 5.0256410256410255, "grad_norm": 23.4547176361084, "learning_rate": 7.487179487179488e-06, "loss": 1.671, "step": 588 }, { "epoch": 5.034188034188034, "grad_norm": 17.382152557373047, "learning_rate": 7.482905982905984e-06, "loss": 1.3935, "step": 589 }, { "epoch": 5.042735042735043, "grad_norm": 19.607717514038086, "learning_rate": 7.47863247863248e-06, "loss": 1.5652, "step": 590 }, { "epoch": 5.051282051282051, "grad_norm": 27.735240936279297, "learning_rate": 7.474358974358975e-06, "loss": 1.5491, "step": 591 }, { "epoch": 5.05982905982906, "grad_norm": 20.493412017822266, "learning_rate": 7.4700854700854706e-06, "loss": 1.9229, "step": 592 }, { "epoch": 5.068376068376068, "grad_norm": 20.492137908935547, "learning_rate": 7.4658119658119665e-06, "loss": 1.5066, "step": 593 }, { "epoch": 5.076923076923077, "grad_norm": 27.650495529174805, "learning_rate": 7.461538461538462e-06, "loss": 1.4228, "step": 594 }, { "epoch": 5.085470085470085, "grad_norm": 22.38190269470215, "learning_rate": 7.457264957264958e-06, "loss": 1.6243, "step": 595 }, { "epoch": 5.094017094017094, "grad_norm": 22.862489700317383, "learning_rate": 7.452991452991454e-06, "loss": 1.9224, "step": 596 }, { "epoch": 5.102564102564102, "grad_norm": 17.368051528930664, "learning_rate": 7.448717948717949e-06, "loss": 1.3642, "step": 597 }, { "epoch": 5.111111111111111, "grad_norm": 20.587018966674805, "learning_rate": 7.444444444444445e-06, "loss": 1.471, "step": 598 }, { "epoch": 5.119658119658119, "grad_norm": 18.502887725830078, "learning_rate": 7.440170940170941e-06, "loss": 1.9841, "step": 599 }, { "epoch": 5.128205128205128, "grad_norm": 21.305294036865234, "learning_rate": 7.435897435897437e-06, "loss": 1.8564, "step": 600 }, { "epoch": 5.136752136752137, "grad_norm": 20.61264419555664, "learning_rate": 7.431623931623933e-06, "loss": 1.3554, "step": 601 }, { "epoch": 5.145299145299146, "grad_norm": 19.05555534362793, "learning_rate": 7.427350427350429e-06, "loss": 1.6612, "step": 602 }, { "epoch": 5.153846153846154, "grad_norm": 20.392446517944336, "learning_rate": 7.423076923076924e-06, "loss": 1.5071, "step": 603 }, { "epoch": 5.162393162393163, "grad_norm": 22.007591247558594, "learning_rate": 7.4188034188034196e-06, "loss": 1.3356, "step": 604 }, { "epoch": 5.170940170940171, "grad_norm": 18.928104400634766, "learning_rate": 7.4145299145299155e-06, "loss": 1.6214, "step": 605 }, { "epoch": 5.17948717948718, "grad_norm": 21.151193618774414, "learning_rate": 7.410256410256411e-06, "loss": 1.5275, "step": 606 }, { "epoch": 5.188034188034188, "grad_norm": 16.272262573242188, "learning_rate": 7.405982905982907e-06, "loss": 1.2773, "step": 607 }, { "epoch": 5.196581196581197, "grad_norm": 21.59275245666504, "learning_rate": 7.401709401709402e-06, "loss": 1.3503, "step": 608 }, { "epoch": 5.205128205128205, "grad_norm": 84.31806182861328, "learning_rate": 7.397435897435898e-06, "loss": 1.8618, "step": 609 }, { "epoch": 5.213675213675214, "grad_norm": 20.374465942382812, "learning_rate": 7.393162393162394e-06, "loss": 1.6153, "step": 610 }, { "epoch": 5.222222222222222, "grad_norm": 18.569623947143555, "learning_rate": 7.38888888888889e-06, "loss": 1.7101, "step": 611 }, { "epoch": 5.230769230769231, "grad_norm": 19.51409339904785, "learning_rate": 7.384615384615386e-06, "loss": 1.5801, "step": 612 }, { "epoch": 5.239316239316239, "grad_norm": 19.45322608947754, "learning_rate": 7.380341880341882e-06, "loss": 1.1376, "step": 613 }, { "epoch": 5.247863247863248, "grad_norm": 23.474557876586914, "learning_rate": 7.376068376068377e-06, "loss": 1.442, "step": 614 }, { "epoch": 5.256410256410256, "grad_norm": 21.458847045898438, "learning_rate": 7.371794871794873e-06, "loss": 1.2769, "step": 615 }, { "epoch": 5.264957264957265, "grad_norm": 25.741121292114258, "learning_rate": 7.3675213675213686e-06, "loss": 1.3321, "step": 616 }, { "epoch": 5.273504273504273, "grad_norm": 15.394718170166016, "learning_rate": 7.3632478632478645e-06, "loss": 1.2335, "step": 617 }, { "epoch": 5.282051282051282, "grad_norm": 20.938871383666992, "learning_rate": 7.35897435897436e-06, "loss": 1.5741, "step": 618 }, { "epoch": 5.2905982905982905, "grad_norm": 19.348268508911133, "learning_rate": 7.354700854700856e-06, "loss": 1.2493, "step": 619 }, { "epoch": 5.299145299145299, "grad_norm": 25.26751708984375, "learning_rate": 7.350427350427351e-06, "loss": 1.5167, "step": 620 }, { "epoch": 5.3076923076923075, "grad_norm": 22.099227905273438, "learning_rate": 7.346153846153847e-06, "loss": 1.3269, "step": 621 }, { "epoch": 5.316239316239316, "grad_norm": 21.483428955078125, "learning_rate": 7.341880341880342e-06, "loss": 1.4249, "step": 622 }, { "epoch": 5.3247863247863245, "grad_norm": 20.089691162109375, "learning_rate": 7.337606837606837e-06, "loss": 1.351, "step": 623 }, { "epoch": 5.333333333333333, "grad_norm": 138.9898223876953, "learning_rate": 7.333333333333333e-06, "loss": 1.5682, "step": 624 }, { "epoch": 5.3418803418803416, "grad_norm": 16.808000564575195, "learning_rate": 7.329059829059829e-06, "loss": 1.4794, "step": 625 }, { "epoch": 5.35042735042735, "grad_norm": 18.58464813232422, "learning_rate": 7.324786324786325e-06, "loss": 1.4486, "step": 626 }, { "epoch": 5.358974358974359, "grad_norm": 15.074477195739746, "learning_rate": 7.320512820512821e-06, "loss": 1.3124, "step": 627 }, { "epoch": 5.367521367521368, "grad_norm": 15.800148963928223, "learning_rate": 7.316239316239317e-06, "loss": 1.7055, "step": 628 }, { "epoch": 5.3760683760683765, "grad_norm": 19.166179656982422, "learning_rate": 7.311965811965812e-06, "loss": 1.7306, "step": 629 }, { "epoch": 5.384615384615385, "grad_norm": 55.91648864746094, "learning_rate": 7.307692307692308e-06, "loss": 1.2376, "step": 630 }, { "epoch": 5.3931623931623935, "grad_norm": 16.606033325195312, "learning_rate": 7.3034188034188035e-06, "loss": 1.1159, "step": 631 }, { "epoch": 5.401709401709402, "grad_norm": 17.0134220123291, "learning_rate": 7.299145299145299e-06, "loss": 1.2124, "step": 632 }, { "epoch": 5.410256410256411, "grad_norm": 17.511932373046875, "learning_rate": 7.294871794871795e-06, "loss": 1.4221, "step": 633 }, { "epoch": 5.418803418803419, "grad_norm": 44.53416061401367, "learning_rate": 7.290598290598291e-06, "loss": 1.9583, "step": 634 }, { "epoch": 5.427350427350428, "grad_norm": 16.546630859375, "learning_rate": 7.286324786324786e-06, "loss": 1.1722, "step": 635 }, { "epoch": 5.435897435897436, "grad_norm": 39.90822982788086, "learning_rate": 7.282051282051282e-06, "loss": 1.7482, "step": 636 }, { "epoch": 5.444444444444445, "grad_norm": 16.186573028564453, "learning_rate": 7.277777777777778e-06, "loss": 1.3422, "step": 637 }, { "epoch": 5.452991452991453, "grad_norm": 18.84516143798828, "learning_rate": 7.273504273504274e-06, "loss": 1.3299, "step": 638 }, { "epoch": 5.461538461538462, "grad_norm": 14.620058059692383, "learning_rate": 7.26923076923077e-06, "loss": 1.0604, "step": 639 }, { "epoch": 5.47008547008547, "grad_norm": 16.5911865234375, "learning_rate": 7.264957264957266e-06, "loss": 1.1138, "step": 640 }, { "epoch": 5.478632478632479, "grad_norm": 15.44485092163086, "learning_rate": 7.260683760683761e-06, "loss": 1.435, "step": 641 }, { "epoch": 5.487179487179487, "grad_norm": 121.76724243164062, "learning_rate": 7.256410256410257e-06, "loss": 1.7167, "step": 642 }, { "epoch": 5.495726495726496, "grad_norm": 1996.141357421875, "learning_rate": 7.2521367521367525e-06, "loss": 4.0296, "step": 643 }, { "epoch": 5.504273504273504, "grad_norm": 15.072067260742188, "learning_rate": 7.247863247863248e-06, "loss": 1.0455, "step": 644 }, { "epoch": 5.512820512820513, "grad_norm": 16.684345245361328, "learning_rate": 7.243589743589744e-06, "loss": 1.7565, "step": 645 }, { "epoch": 5.521367521367521, "grad_norm": 15.515148162841797, "learning_rate": 7.239316239316239e-06, "loss": 1.4601, "step": 646 }, { "epoch": 5.52991452991453, "grad_norm": 20.1015625, "learning_rate": 7.235042735042735e-06, "loss": 1.073, "step": 647 }, { "epoch": 5.538461538461538, "grad_norm": 67.10873413085938, "learning_rate": 7.230769230769231e-06, "loss": 1.8586, "step": 648 }, { "epoch": 5.547008547008547, "grad_norm": 13.775193214416504, "learning_rate": 7.226495726495727e-06, "loss": 1.2891, "step": 649 }, { "epoch": 5.555555555555555, "grad_norm": 14.612048149108887, "learning_rate": 7.222222222222223e-06, "loss": 1.033, "step": 650 }, { "epoch": 5.564102564102564, "grad_norm": 14.512042999267578, "learning_rate": 7.217948717948719e-06, "loss": 1.1446, "step": 651 }, { "epoch": 5.572649572649572, "grad_norm": 13.720820426940918, "learning_rate": 7.213675213675214e-06, "loss": 1.1246, "step": 652 }, { "epoch": 5.581196581196581, "grad_norm": 16.548046112060547, "learning_rate": 7.20940170940171e-06, "loss": 1.3162, "step": 653 }, { "epoch": 5.589743589743589, "grad_norm": 20.535181045532227, "learning_rate": 7.205128205128206e-06, "loss": 1.3019, "step": 654 }, { "epoch": 5.598290598290598, "grad_norm": 14.317465782165527, "learning_rate": 7.2008547008547015e-06, "loss": 1.5447, "step": 655 }, { "epoch": 5.6068376068376065, "grad_norm": 16.23088836669922, "learning_rate": 7.196581196581197e-06, "loss": 1.2701, "step": 656 }, { "epoch": 5.615384615384615, "grad_norm": 13.754173278808594, "learning_rate": 7.192307692307693e-06, "loss": 1.2218, "step": 657 }, { "epoch": 5.6239316239316235, "grad_norm": 75.77688598632812, "learning_rate": 7.188034188034188e-06, "loss": 1.7547, "step": 658 }, { "epoch": 5.632478632478632, "grad_norm": 19.452077865600586, "learning_rate": 7.183760683760684e-06, "loss": 1.1446, "step": 659 }, { "epoch": 5.641025641025641, "grad_norm": 14.513677597045898, "learning_rate": 7.17948717948718e-06, "loss": 1.0527, "step": 660 }, { "epoch": 5.64957264957265, "grad_norm": 27.67446517944336, "learning_rate": 7.175213675213676e-06, "loss": 1.1953, "step": 661 }, { "epoch": 5.6581196581196584, "grad_norm": 12.137639999389648, "learning_rate": 7.170940170940172e-06, "loss": 1.1127, "step": 662 }, { "epoch": 5.666666666666667, "grad_norm": 17.2878475189209, "learning_rate": 7.166666666666667e-06, "loss": 1.0475, "step": 663 }, { "epoch": 5.6752136752136755, "grad_norm": 28.070842742919922, "learning_rate": 7.162393162393163e-06, "loss": 1.6271, "step": 664 }, { "epoch": 5.683760683760684, "grad_norm": 17.74942398071289, "learning_rate": 7.158119658119659e-06, "loss": 1.1759, "step": 665 }, { "epoch": 5.6923076923076925, "grad_norm": 19.545486450195312, "learning_rate": 7.153846153846155e-06, "loss": 0.9753, "step": 666 }, { "epoch": 5.700854700854701, "grad_norm": 24.34153938293457, "learning_rate": 7.1495726495726505e-06, "loss": 1.0905, "step": 667 }, { "epoch": 5.7094017094017095, "grad_norm": 211.7845001220703, "learning_rate": 7.145299145299146e-06, "loss": 1.6455, "step": 668 }, { "epoch": 5.717948717948718, "grad_norm": 14.03074836730957, "learning_rate": 7.1410256410256414e-06, "loss": 1.3728, "step": 669 }, { "epoch": 5.726495726495727, "grad_norm": 27.600345611572266, "learning_rate": 7.136752136752137e-06, "loss": 1.4212, "step": 670 }, { "epoch": 5.735042735042735, "grad_norm": 15.755846977233887, "learning_rate": 7.132478632478633e-06, "loss": 1.148, "step": 671 }, { "epoch": 5.743589743589744, "grad_norm": 12.816133499145508, "learning_rate": 7.128205128205129e-06, "loss": 1.0053, "step": 672 }, { "epoch": 5.752136752136752, "grad_norm": 25.097660064697266, "learning_rate": 7.123931623931625e-06, "loss": 1.1561, "step": 673 }, { "epoch": 5.760683760683761, "grad_norm": 19.249279022216797, "learning_rate": 7.119658119658121e-06, "loss": 1.2582, "step": 674 }, { "epoch": 5.769230769230769, "grad_norm": 18.606924057006836, "learning_rate": 7.115384615384616e-06, "loss": 0.8569, "step": 675 }, { "epoch": 5.777777777777778, "grad_norm": 20.2148380279541, "learning_rate": 7.111111111111112e-06, "loss": 1.1126, "step": 676 }, { "epoch": 5.786324786324786, "grad_norm": 18.623268127441406, "learning_rate": 7.106837606837608e-06, "loss": 1.6129, "step": 677 }, { "epoch": 5.794871794871795, "grad_norm": 14.888258934020996, "learning_rate": 7.102564102564104e-06, "loss": 1.2533, "step": 678 }, { "epoch": 5.803418803418803, "grad_norm": 15.351551055908203, "learning_rate": 7.0982905982905995e-06, "loss": 1.2392, "step": 679 }, { "epoch": 5.811965811965812, "grad_norm": 23.243993759155273, "learning_rate": 7.0940170940170945e-06, "loss": 1.3136, "step": 680 }, { "epoch": 5.82051282051282, "grad_norm": 18.346277236938477, "learning_rate": 7.0897435897435904e-06, "loss": 1.5691, "step": 681 }, { "epoch": 5.829059829059829, "grad_norm": 12.904829025268555, "learning_rate": 7.085470085470086e-06, "loss": 0.9248, "step": 682 }, { "epoch": 5.837606837606837, "grad_norm": 13.263056755065918, "learning_rate": 7.081196581196582e-06, "loss": 1.0555, "step": 683 }, { "epoch": 5.846153846153846, "grad_norm": 19.311899185180664, "learning_rate": 7.076923076923078e-06, "loss": 1.4341, "step": 684 }, { "epoch": 5.854700854700854, "grad_norm": 282.1452331542969, "learning_rate": 7.072649572649574e-06, "loss": 1.9797, "step": 685 }, { "epoch": 5.863247863247864, "grad_norm": 14.317438125610352, "learning_rate": 7.068376068376069e-06, "loss": 0.839, "step": 686 }, { "epoch": 5.871794871794872, "grad_norm": 13.549150466918945, "learning_rate": 7.064102564102565e-06, "loss": 1.1003, "step": 687 }, { "epoch": 5.880341880341881, "grad_norm": 14.283610343933105, "learning_rate": 7.059829059829061e-06, "loss": 1.0297, "step": 688 }, { "epoch": 5.888888888888889, "grad_norm": 18.737884521484375, "learning_rate": 7.055555555555557e-06, "loss": 0.9817, "step": 689 }, { "epoch": 5.897435897435898, "grad_norm": 24.12625503540039, "learning_rate": 7.051282051282053e-06, "loss": 1.1837, "step": 690 }, { "epoch": 5.905982905982906, "grad_norm": 11.760732650756836, "learning_rate": 7.0470085470085485e-06, "loss": 1.5131, "step": 691 }, { "epoch": 5.914529914529915, "grad_norm": 16.138668060302734, "learning_rate": 7.0427350427350435e-06, "loss": 0.9569, "step": 692 }, { "epoch": 5.923076923076923, "grad_norm": 17.727285385131836, "learning_rate": 7.038461538461539e-06, "loss": 0.9834, "step": 693 }, { "epoch": 5.931623931623932, "grad_norm": 13.434252738952637, "learning_rate": 7.034188034188035e-06, "loss": 1.3635, "step": 694 }, { "epoch": 5.94017094017094, "grad_norm": 15.587186813354492, "learning_rate": 7.02991452991453e-06, "loss": 1.4814, "step": 695 }, { "epoch": 5.948717948717949, "grad_norm": 31.379039764404297, "learning_rate": 7.025641025641025e-06, "loss": 0.8792, "step": 696 }, { "epoch": 5.957264957264957, "grad_norm": 14.575559616088867, "learning_rate": 7.021367521367521e-06, "loss": 0.8865, "step": 697 }, { "epoch": 5.965811965811966, "grad_norm": 13.55718994140625, "learning_rate": 7.017094017094017e-06, "loss": 0.9564, "step": 698 }, { "epoch": 5.9743589743589745, "grad_norm": 13.288110733032227, "learning_rate": 7.012820512820513e-06, "loss": 0.8117, "step": 699 }, { "epoch": 5.982905982905983, "grad_norm": 14.522254943847656, "learning_rate": 7.008547008547009e-06, "loss": 1.2037, "step": 700 }, { "epoch": 5.9914529914529915, "grad_norm": 14.575456619262695, "learning_rate": 7.004273504273504e-06, "loss": 1.028, "step": 701 }, { "epoch": 6.0, "grad_norm": 13.18249225616455, "learning_rate": 7e-06, "loss": 0.6528, "step": 702 }, { "epoch": 6.0, "eval_loss": 0.4769609868526459, "eval_runtime": 9.253, "eval_samples_per_second": 50.362, "eval_steps_per_second": 6.376, "step": 702 }, { "epoch": 6.0085470085470085, "grad_norm": 17.034433364868164, "learning_rate": 6.995726495726496e-06, "loss": 0.847, "step": 703 }, { "epoch": 6.017094017094017, "grad_norm": 13.455194473266602, "learning_rate": 6.991452991452992e-06, "loss": 0.8545, "step": 704 }, { "epoch": 6.0256410256410255, "grad_norm": 14.511704444885254, "learning_rate": 6.9871794871794876e-06, "loss": 0.9365, "step": 705 }, { "epoch": 6.034188034188034, "grad_norm": 14.325255393981934, "learning_rate": 6.9829059829059835e-06, "loss": 0.869, "step": 706 }, { "epoch": 6.042735042735043, "grad_norm": 12.944524765014648, "learning_rate": 6.9786324786324785e-06, "loss": 1.1417, "step": 707 }, { "epoch": 6.051282051282051, "grad_norm": 14.992669105529785, "learning_rate": 6.974358974358974e-06, "loss": 1.4935, "step": 708 }, { "epoch": 6.05982905982906, "grad_norm": 15.394392013549805, "learning_rate": 6.97008547008547e-06, "loss": 1.519, "step": 709 }, { "epoch": 6.068376068376068, "grad_norm": 12.605085372924805, "learning_rate": 6.965811965811966e-06, "loss": 1.4419, "step": 710 }, { "epoch": 6.076923076923077, "grad_norm": 16.47636604309082, "learning_rate": 6.961538461538462e-06, "loss": 0.9552, "step": 711 }, { "epoch": 6.085470085470085, "grad_norm": 17.04586410522461, "learning_rate": 6.957264957264958e-06, "loss": 0.9847, "step": 712 }, { "epoch": 6.094017094017094, "grad_norm": 15.464738845825195, "learning_rate": 6.952991452991453e-06, "loss": 0.9272, "step": 713 }, { "epoch": 6.102564102564102, "grad_norm": 11.837206840515137, "learning_rate": 6.948717948717949e-06, "loss": 1.1682, "step": 714 }, { "epoch": 6.111111111111111, "grad_norm": 11.013447761535645, "learning_rate": 6.944444444444445e-06, "loss": 1.222, "step": 715 }, { "epoch": 6.119658119658119, "grad_norm": 15.37415885925293, "learning_rate": 6.940170940170941e-06, "loss": 0.9668, "step": 716 }, { "epoch": 6.128205128205128, "grad_norm": 14.077155113220215, "learning_rate": 6.9358974358974366e-06, "loss": 0.8448, "step": 717 }, { "epoch": 6.136752136752137, "grad_norm": 13.440519332885742, "learning_rate": 6.931623931623932e-06, "loss": 0.891, "step": 718 }, { "epoch": 6.145299145299146, "grad_norm": 13.059304237365723, "learning_rate": 6.9273504273504275e-06, "loss": 0.655, "step": 719 }, { "epoch": 6.153846153846154, "grad_norm": 12.96674633026123, "learning_rate": 6.923076923076923e-06, "loss": 0.7755, "step": 720 }, { "epoch": 6.162393162393163, "grad_norm": 10.921567916870117, "learning_rate": 6.918803418803419e-06, "loss": 0.8533, "step": 721 }, { "epoch": 6.170940170940171, "grad_norm": 10.439260482788086, "learning_rate": 6.914529914529915e-06, "loss": 0.8294, "step": 722 }, { "epoch": 6.17948717948718, "grad_norm": 14.948200225830078, "learning_rate": 6.910256410256411e-06, "loss": 0.7326, "step": 723 }, { "epoch": 6.188034188034188, "grad_norm": 12.733176231384277, "learning_rate": 6.905982905982906e-06, "loss": 1.0244, "step": 724 }, { "epoch": 6.196581196581197, "grad_norm": 12.432938575744629, "learning_rate": 6.901709401709402e-06, "loss": 0.7375, "step": 725 }, { "epoch": 6.205128205128205, "grad_norm": 12.047768592834473, "learning_rate": 6.897435897435898e-06, "loss": 0.8348, "step": 726 }, { "epoch": 6.213675213675214, "grad_norm": 19.029287338256836, "learning_rate": 6.893162393162394e-06, "loss": 0.6091, "step": 727 }, { "epoch": 6.222222222222222, "grad_norm": 11.650983810424805, "learning_rate": 6.88888888888889e-06, "loss": 0.9925, "step": 728 }, { "epoch": 6.230769230769231, "grad_norm": 12.12030029296875, "learning_rate": 6.8846153846153855e-06, "loss": 1.0205, "step": 729 }, { "epoch": 6.239316239316239, "grad_norm": 10.283143997192383, "learning_rate": 6.880341880341881e-06, "loss": 0.7726, "step": 730 }, { "epoch": 6.247863247863248, "grad_norm": 12.965302467346191, "learning_rate": 6.8760683760683765e-06, "loss": 1.1761, "step": 731 }, { "epoch": 6.256410256410256, "grad_norm": 9.0562105178833, "learning_rate": 6.871794871794872e-06, "loss": 0.9769, "step": 732 }, { "epoch": 6.264957264957265, "grad_norm": 13.647340774536133, "learning_rate": 6.867521367521368e-06, "loss": 0.7613, "step": 733 }, { "epoch": 6.273504273504273, "grad_norm": 11.598361015319824, "learning_rate": 6.863247863247864e-06, "loss": 0.6236, "step": 734 }, { "epoch": 6.282051282051282, "grad_norm": 10.453935623168945, "learning_rate": 6.858974358974359e-06, "loss": 0.9752, "step": 735 }, { "epoch": 6.2905982905982905, "grad_norm": 14.108942985534668, "learning_rate": 6.854700854700855e-06, "loss": 0.9212, "step": 736 }, { "epoch": 6.299145299145299, "grad_norm": 21.230859756469727, "learning_rate": 6.850427350427351e-06, "loss": 0.9213, "step": 737 }, { "epoch": 6.3076923076923075, "grad_norm": 11.801465034484863, "learning_rate": 6.846153846153847e-06, "loss": 0.8182, "step": 738 }, { "epoch": 6.316239316239316, "grad_norm": 18.9310302734375, "learning_rate": 6.841880341880343e-06, "loss": 0.6214, "step": 739 }, { "epoch": 6.3247863247863245, "grad_norm": 11.773117065429688, "learning_rate": 6.837606837606839e-06, "loss": 0.6221, "step": 740 }, { "epoch": 6.333333333333333, "grad_norm": 187.00250244140625, "learning_rate": 6.833333333333334e-06, "loss": 1.5211, "step": 741 }, { "epoch": 6.3418803418803416, "grad_norm": 70.96250915527344, "learning_rate": 6.82905982905983e-06, "loss": 1.3472, "step": 742 }, { "epoch": 6.35042735042735, "grad_norm": 11.787941932678223, "learning_rate": 6.8247863247863255e-06, "loss": 0.8831, "step": 743 }, { "epoch": 6.358974358974359, "grad_norm": 11.33661937713623, "learning_rate": 6.820512820512821e-06, "loss": 1.0555, "step": 744 }, { "epoch": 6.367521367521368, "grad_norm": 14.255888938903809, "learning_rate": 6.816239316239317e-06, "loss": 0.8246, "step": 745 }, { "epoch": 6.3760683760683765, "grad_norm": 10.89616870880127, "learning_rate": 6.811965811965813e-06, "loss": 1.0179, "step": 746 }, { "epoch": 6.384615384615385, "grad_norm": 9.160380363464355, "learning_rate": 6.807692307692308e-06, "loss": 0.9019, "step": 747 }, { "epoch": 6.3931623931623935, "grad_norm": 12.984644889831543, "learning_rate": 6.803418803418804e-06, "loss": 0.649, "step": 748 }, { "epoch": 6.401709401709402, "grad_norm": 14.073376655578613, "learning_rate": 6.7991452991453e-06, "loss": 0.608, "step": 749 }, { "epoch": 6.410256410256411, "grad_norm": 10.354485511779785, "learning_rate": 6.794871794871796e-06, "loss": 0.8812, "step": 750 }, { "epoch": 6.418803418803419, "grad_norm": 9.121294975280762, "learning_rate": 6.790598290598292e-06, "loss": 0.768, "step": 751 }, { "epoch": 6.427350427350428, "grad_norm": 10.909361839294434, "learning_rate": 6.786324786324787e-06, "loss": 0.8697, "step": 752 }, { "epoch": 6.435897435897436, "grad_norm": 26.324186325073242, "learning_rate": 6.782051282051283e-06, "loss": 1.2437, "step": 753 }, { "epoch": 6.444444444444445, "grad_norm": 11.972411155700684, "learning_rate": 6.777777777777779e-06, "loss": 0.6366, "step": 754 }, { "epoch": 6.452991452991453, "grad_norm": 25.042150497436523, "learning_rate": 6.7735042735042745e-06, "loss": 1.0371, "step": 755 }, { "epoch": 6.461538461538462, "grad_norm": 10.331900596618652, "learning_rate": 6.76923076923077e-06, "loss": 0.5618, "step": 756 }, { "epoch": 6.47008547008547, "grad_norm": 11.925344467163086, "learning_rate": 6.764957264957266e-06, "loss": 0.629, "step": 757 }, { "epoch": 6.478632478632479, "grad_norm": 10.309441566467285, "learning_rate": 6.760683760683761e-06, "loss": 0.7158, "step": 758 }, { "epoch": 6.487179487179487, "grad_norm": 11.374105453491211, "learning_rate": 6.756410256410257e-06, "loss": 0.6909, "step": 759 }, { "epoch": 6.495726495726496, "grad_norm": 11.613142967224121, "learning_rate": 6.752136752136753e-06, "loss": 0.6139, "step": 760 }, { "epoch": 6.504273504273504, "grad_norm": 14.499147415161133, "learning_rate": 6.747863247863249e-06, "loss": 0.7242, "step": 761 }, { "epoch": 6.512820512820513, "grad_norm": 13.683001518249512, "learning_rate": 6.743589743589745e-06, "loss": 0.9246, "step": 762 }, { "epoch": 6.521367521367521, "grad_norm": 11.068865776062012, "learning_rate": 6.739316239316241e-06, "loss": 0.8866, "step": 763 }, { "epoch": 6.52991452991453, "grad_norm": 13.0232572555542, "learning_rate": 6.735042735042736e-06, "loss": 0.86, "step": 764 }, { "epoch": 6.538461538461538, "grad_norm": 10.639331817626953, "learning_rate": 6.730769230769232e-06, "loss": 0.6928, "step": 765 }, { "epoch": 6.547008547008547, "grad_norm": 11.792994499206543, "learning_rate": 6.7264957264957276e-06, "loss": 0.6571, "step": 766 }, { "epoch": 6.555555555555555, "grad_norm": 15.907414436340332, "learning_rate": 6.7222222222222235e-06, "loss": 1.1426, "step": 767 }, { "epoch": 6.564102564102564, "grad_norm": 12.207514762878418, "learning_rate": 6.717948717948718e-06, "loss": 1.0932, "step": 768 }, { "epoch": 6.572649572649572, "grad_norm": 20.145288467407227, "learning_rate": 6.7136752136752135e-06, "loss": 0.9706, "step": 769 }, { "epoch": 6.581196581196581, "grad_norm": 9.820805549621582, "learning_rate": 6.7094017094017094e-06, "loss": 0.4955, "step": 770 }, { "epoch": 6.589743589743589, "grad_norm": 10.385655403137207, "learning_rate": 6.705128205128205e-06, "loss": 1.0172, "step": 771 }, { "epoch": 6.598290598290598, "grad_norm": 11.708373069763184, "learning_rate": 6.700854700854701e-06, "loss": 0.8048, "step": 772 }, { "epoch": 6.6068376068376065, "grad_norm": 9.812984466552734, "learning_rate": 6.696581196581196e-06, "loss": 0.4831, "step": 773 }, { "epoch": 6.615384615384615, "grad_norm": 9.146960258483887, "learning_rate": 6.692307692307692e-06, "loss": 0.6178, "step": 774 }, { "epoch": 6.6239316239316235, "grad_norm": 13.61231517791748, "learning_rate": 6.688034188034188e-06, "loss": 0.7812, "step": 775 }, { "epoch": 6.632478632478632, "grad_norm": 10.349262237548828, "learning_rate": 6.683760683760684e-06, "loss": 0.819, "step": 776 }, { "epoch": 6.641025641025641, "grad_norm": 48.387847900390625, "learning_rate": 6.67948717948718e-06, "loss": 1.5294, "step": 777 }, { "epoch": 6.64957264957265, "grad_norm": 9.540630340576172, "learning_rate": 6.675213675213676e-06, "loss": 0.6564, "step": 778 }, { "epoch": 6.6581196581196584, "grad_norm": 10.83983039855957, "learning_rate": 6.670940170940171e-06, "loss": 0.5109, "step": 779 }, { "epoch": 6.666666666666667, "grad_norm": 15.380743026733398, "learning_rate": 6.666666666666667e-06, "loss": 0.6504, "step": 780 }, { "epoch": 6.6752136752136755, "grad_norm": 16.796918869018555, "learning_rate": 6.6623931623931625e-06, "loss": 0.7944, "step": 781 }, { "epoch": 6.683760683760684, "grad_norm": 39.64078140258789, "learning_rate": 6.6581196581196584e-06, "loss": 0.6929, "step": 782 }, { "epoch": 6.6923076923076925, "grad_norm": 7.730568885803223, "learning_rate": 6.653846153846154e-06, "loss": 0.6284, "step": 783 }, { "epoch": 6.700854700854701, "grad_norm": 7.840725898742676, "learning_rate": 6.64957264957265e-06, "loss": 0.5113, "step": 784 }, { "epoch": 6.7094017094017095, "grad_norm": 13.925577163696289, "learning_rate": 6.645299145299145e-06, "loss": 0.6846, "step": 785 }, { "epoch": 6.717948717948718, "grad_norm": 10.926531791687012, "learning_rate": 6.641025641025641e-06, "loss": 1.3245, "step": 786 }, { "epoch": 6.726495726495727, "grad_norm": 10.698541641235352, "learning_rate": 6.636752136752137e-06, "loss": 0.6025, "step": 787 }, { "epoch": 6.735042735042735, "grad_norm": 7.572136878967285, "learning_rate": 6.632478632478633e-06, "loss": 0.5473, "step": 788 }, { "epoch": 6.743589743589744, "grad_norm": 26.242990493774414, "learning_rate": 6.628205128205129e-06, "loss": 0.5637, "step": 789 }, { "epoch": 6.752136752136752, "grad_norm": 8.79776668548584, "learning_rate": 6.623931623931624e-06, "loss": 0.7595, "step": 790 }, { "epoch": 6.760683760683761, "grad_norm": 8.951017379760742, "learning_rate": 6.61965811965812e-06, "loss": 1.0365, "step": 791 }, { "epoch": 6.769230769230769, "grad_norm": 13.799118041992188, "learning_rate": 6.615384615384616e-06, "loss": 1.4206, "step": 792 }, { "epoch": 6.777777777777778, "grad_norm": 674.3671875, "learning_rate": 6.6111111111111115e-06, "loss": 1.1752, "step": 793 }, { "epoch": 6.786324786324786, "grad_norm": 8.110879898071289, "learning_rate": 6.606837606837607e-06, "loss": 0.4668, "step": 794 }, { "epoch": 6.794871794871795, "grad_norm": 8.119854927062988, "learning_rate": 6.602564102564103e-06, "loss": 0.7689, "step": 795 }, { "epoch": 6.803418803418803, "grad_norm": 11.039762496948242, "learning_rate": 6.598290598290598e-06, "loss": 0.5636, "step": 796 }, { "epoch": 6.811965811965812, "grad_norm": 12.724084854125977, "learning_rate": 6.594017094017094e-06, "loss": 0.5072, "step": 797 }, { "epoch": 6.82051282051282, "grad_norm": 12.196049690246582, "learning_rate": 6.58974358974359e-06, "loss": 0.5073, "step": 798 }, { "epoch": 6.829059829059829, "grad_norm": 9.072951316833496, "learning_rate": 6.585470085470086e-06, "loss": 0.4855, "step": 799 }, { "epoch": 6.837606837606837, "grad_norm": 10.53836441040039, "learning_rate": 6.581196581196582e-06, "loss": 1.0017, "step": 800 }, { "epoch": 6.846153846153846, "grad_norm": 7.728690147399902, "learning_rate": 6.576923076923078e-06, "loss": 0.5784, "step": 801 }, { "epoch": 6.854700854700854, "grad_norm": 28.362455368041992, "learning_rate": 6.572649572649573e-06, "loss": 1.0295, "step": 802 }, { "epoch": 6.863247863247864, "grad_norm": 7.291123390197754, "learning_rate": 6.568376068376069e-06, "loss": 0.7836, "step": 803 }, { "epoch": 6.871794871794872, "grad_norm": 9.566614151000977, "learning_rate": 6.564102564102565e-06, "loss": 0.9979, "step": 804 }, { "epoch": 6.880341880341881, "grad_norm": 13.544408798217773, "learning_rate": 6.5598290598290605e-06, "loss": 0.5354, "step": 805 }, { "epoch": 6.888888888888889, "grad_norm": 8.546881675720215, "learning_rate": 6.555555555555556e-06, "loss": 0.4689, "step": 806 }, { "epoch": 6.897435897435898, "grad_norm": 8.94822883605957, "learning_rate": 6.5512820512820515e-06, "loss": 0.4432, "step": 807 }, { "epoch": 6.905982905982906, "grad_norm": 6.5176544189453125, "learning_rate": 6.547008547008547e-06, "loss": 0.6747, "step": 808 }, { "epoch": 6.914529914529915, "grad_norm": 9.48947811126709, "learning_rate": 6.542735042735043e-06, "loss": 0.4268, "step": 809 }, { "epoch": 6.923076923076923, "grad_norm": 11.432586669921875, "learning_rate": 6.538461538461539e-06, "loss": 0.5486, "step": 810 }, { "epoch": 6.931623931623932, "grad_norm": 7.585604667663574, "learning_rate": 6.534188034188035e-06, "loss": 0.4412, "step": 811 }, { "epoch": 6.94017094017094, "grad_norm": 7.860292911529541, "learning_rate": 6.529914529914531e-06, "loss": 0.6428, "step": 812 }, { "epoch": 6.948717948717949, "grad_norm": 27.83890151977539, "learning_rate": 6.525641025641026e-06, "loss": 0.6735, "step": 813 }, { "epoch": 6.957264957264957, "grad_norm": 10.266451835632324, "learning_rate": 6.521367521367522e-06, "loss": 0.6757, "step": 814 }, { "epoch": 6.965811965811966, "grad_norm": 8.839099884033203, "learning_rate": 6.517094017094018e-06, "loss": 0.7897, "step": 815 }, { "epoch": 6.9743589743589745, "grad_norm": 10.037760734558105, "learning_rate": 6.512820512820514e-06, "loss": 0.7133, "step": 816 }, { "epoch": 6.982905982905983, "grad_norm": 14.50278377532959, "learning_rate": 6.5085470085470095e-06, "loss": 1.0051, "step": 817 }, { "epoch": 6.9914529914529915, "grad_norm": 8.775527000427246, "learning_rate": 6.504273504273505e-06, "loss": 0.8769, "step": 818 }, { "epoch": 7.0, "grad_norm": 8.891378402709961, "learning_rate": 6.5000000000000004e-06, "loss": 0.9586, "step": 819 }, { "epoch": 7.0, "eval_loss": 0.23673956096172333, "eval_runtime": 9.3447, "eval_samples_per_second": 49.868, "eval_steps_per_second": 6.314, "step": 819 }, { "epoch": 7.0085470085470085, "grad_norm": 8.925857543945312, "learning_rate": 6.495726495726496e-06, "loss": 1.0406, "step": 820 }, { "epoch": 7.017094017094017, "grad_norm": 8.222796440124512, "learning_rate": 6.491452991452992e-06, "loss": 0.4911, "step": 821 }, { "epoch": 7.0256410256410255, "grad_norm": 11.528886795043945, "learning_rate": 6.487179487179488e-06, "loss": 0.8292, "step": 822 }, { "epoch": 7.034188034188034, "grad_norm": 7.9031524658203125, "learning_rate": 6.482905982905984e-06, "loss": 0.5319, "step": 823 }, { "epoch": 7.042735042735043, "grad_norm": 6.788857936859131, "learning_rate": 6.478632478632479e-06, "loss": 0.431, "step": 824 }, { "epoch": 7.051282051282051, "grad_norm": 8.84765911102295, "learning_rate": 6.474358974358975e-06, "loss": 0.6417, "step": 825 }, { "epoch": 7.05982905982906, "grad_norm": 7.517561435699463, "learning_rate": 6.470085470085471e-06, "loss": 0.5828, "step": 826 }, { "epoch": 7.068376068376068, "grad_norm": 9.86832332611084, "learning_rate": 6.465811965811967e-06, "loss": 0.5851, "step": 827 }, { "epoch": 7.076923076923077, "grad_norm": 9.632494926452637, "learning_rate": 6.461538461538463e-06, "loss": 0.769, "step": 828 }, { "epoch": 7.085470085470085, "grad_norm": 9.874857902526855, "learning_rate": 6.4572649572649585e-06, "loss": 0.4393, "step": 829 }, { "epoch": 7.094017094017094, "grad_norm": 11.78085994720459, "learning_rate": 6.4529914529914535e-06, "loss": 0.8784, "step": 830 }, { "epoch": 7.102564102564102, "grad_norm": 8.85053825378418, "learning_rate": 6.4487179487179494e-06, "loss": 0.5911, "step": 831 }, { "epoch": 7.111111111111111, "grad_norm": 12.405013084411621, "learning_rate": 6.444444444444445e-06, "loss": 0.4941, "step": 832 }, { "epoch": 7.119658119658119, "grad_norm": 12.237760543823242, "learning_rate": 6.440170940170941e-06, "loss": 0.4468, "step": 833 }, { "epoch": 7.128205128205128, "grad_norm": 7.945899486541748, "learning_rate": 6.435897435897437e-06, "loss": 0.4101, "step": 834 }, { "epoch": 7.136752136752137, "grad_norm": 10.743217468261719, "learning_rate": 6.431623931623933e-06, "loss": 0.679, "step": 835 }, { "epoch": 7.145299145299146, "grad_norm": 7.700406551361084, "learning_rate": 6.427350427350428e-06, "loss": 0.5067, "step": 836 }, { "epoch": 7.153846153846154, "grad_norm": 8.401918411254883, "learning_rate": 6.423076923076924e-06, "loss": 0.5893, "step": 837 }, { "epoch": 7.162393162393163, "grad_norm": 23.065881729125977, "learning_rate": 6.41880341880342e-06, "loss": 0.6768, "step": 838 }, { "epoch": 7.170940170940171, "grad_norm": 38.71855545043945, "learning_rate": 6.414529914529916e-06, "loss": 0.8828, "step": 839 }, { "epoch": 7.17948717948718, "grad_norm": 12.142110824584961, "learning_rate": 6.410256410256412e-06, "loss": 0.5444, "step": 840 }, { "epoch": 7.188034188034188, "grad_norm": 69.4731674194336, "learning_rate": 6.405982905982906e-06, "loss": 0.7768, "step": 841 }, { "epoch": 7.196581196581197, "grad_norm": 15.926841735839844, "learning_rate": 6.401709401709402e-06, "loss": 0.4348, "step": 842 }, { "epoch": 7.205128205128205, "grad_norm": 6.8418965339660645, "learning_rate": 6.397435897435898e-06, "loss": 0.3821, "step": 843 }, { "epoch": 7.213675213675214, "grad_norm": 6.716574192047119, "learning_rate": 6.3931623931623935e-06, "loss": 0.3621, "step": 844 }, { "epoch": 7.222222222222222, "grad_norm": 7.452919006347656, "learning_rate": 6.3888888888888885e-06, "loss": 0.4997, "step": 845 }, { "epoch": 7.230769230769231, "grad_norm": 11.502019882202148, "learning_rate": 6.384615384615384e-06, "loss": 0.8017, "step": 846 }, { "epoch": 7.239316239316239, "grad_norm": 7.349746227264404, "learning_rate": 6.38034188034188e-06, "loss": 0.2745, "step": 847 }, { "epoch": 7.247863247863248, "grad_norm": 6.269787311553955, "learning_rate": 6.376068376068376e-06, "loss": 0.4131, "step": 848 }, { "epoch": 7.256410256410256, "grad_norm": 9.56203842163086, "learning_rate": 6.371794871794872e-06, "loss": 0.8147, "step": 849 }, { "epoch": 7.264957264957265, "grad_norm": 7.358108043670654, "learning_rate": 6.367521367521368e-06, "loss": 0.3552, "step": 850 }, { "epoch": 7.273504273504273, "grad_norm": 7.6359782218933105, "learning_rate": 6.363247863247863e-06, "loss": 0.3302, "step": 851 }, { "epoch": 7.282051282051282, "grad_norm": 7.356925010681152, "learning_rate": 6.358974358974359e-06, "loss": 0.2927, "step": 852 }, { "epoch": 7.2905982905982905, "grad_norm": 11.097757339477539, "learning_rate": 6.354700854700855e-06, "loss": 0.8117, "step": 853 }, { "epoch": 7.299145299145299, "grad_norm": 10.301170349121094, "learning_rate": 6.350427350427351e-06, "loss": 0.4044, "step": 854 }, { "epoch": 7.3076923076923075, "grad_norm": 7.116042613983154, "learning_rate": 6.3461538461538466e-06, "loss": 0.289, "step": 855 }, { "epoch": 7.316239316239316, "grad_norm": 7.453964710235596, "learning_rate": 6.3418803418803425e-06, "loss": 0.4652, "step": 856 }, { "epoch": 7.3247863247863245, "grad_norm": 11.864774703979492, "learning_rate": 6.3376068376068375e-06, "loss": 0.4667, "step": 857 }, { "epoch": 7.333333333333333, "grad_norm": 8.79547119140625, "learning_rate": 6.333333333333333e-06, "loss": 0.2874, "step": 858 }, { "epoch": 7.3418803418803416, "grad_norm": 10.173043251037598, "learning_rate": 6.329059829059829e-06, "loss": 0.6844, "step": 859 }, { "epoch": 7.35042735042735, "grad_norm": 9.26555061340332, "learning_rate": 6.324786324786325e-06, "loss": 0.2903, "step": 860 }, { "epoch": 7.358974358974359, "grad_norm": 10.274518013000488, "learning_rate": 6.320512820512821e-06, "loss": 0.7824, "step": 861 }, { "epoch": 7.367521367521368, "grad_norm": 7.104451656341553, "learning_rate": 6.316239316239316e-06, "loss": 0.3024, "step": 862 }, { "epoch": 7.3760683760683765, "grad_norm": 9.522738456726074, "learning_rate": 6.311965811965812e-06, "loss": 0.3219, "step": 863 }, { "epoch": 7.384615384615385, "grad_norm": 10.145588874816895, "learning_rate": 6.307692307692308e-06, "loss": 0.5319, "step": 864 }, { "epoch": 7.3931623931623935, "grad_norm": 8.828988075256348, "learning_rate": 6.303418803418804e-06, "loss": 0.3286, "step": 865 }, { "epoch": 7.401709401709402, "grad_norm": 7.314462661743164, "learning_rate": 6.2991452991453e-06, "loss": 0.2951, "step": 866 }, { "epoch": 7.410256410256411, "grad_norm": 13.465666770935059, "learning_rate": 6.2948717948717956e-06, "loss": 0.4046, "step": 867 }, { "epoch": 7.418803418803419, "grad_norm": 12.40607738494873, "learning_rate": 6.290598290598291e-06, "loss": 0.71, "step": 868 }, { "epoch": 7.427350427350428, "grad_norm": 9.282904624938965, "learning_rate": 6.2863247863247865e-06, "loss": 0.4083, "step": 869 }, { "epoch": 7.435897435897436, "grad_norm": 5.755247116088867, "learning_rate": 6.282051282051282e-06, "loss": 0.3858, "step": 870 }, { "epoch": 7.444444444444445, "grad_norm": 6.996497631072998, "learning_rate": 6.277777777777778e-06, "loss": 0.2692, "step": 871 }, { "epoch": 7.452991452991453, "grad_norm": 7.235395431518555, "learning_rate": 6.273504273504274e-06, "loss": 0.3936, "step": 872 }, { "epoch": 7.461538461538462, "grad_norm": 14.275704383850098, "learning_rate": 6.26923076923077e-06, "loss": 0.4022, "step": 873 }, { "epoch": 7.47008547008547, "grad_norm": 10.365689277648926, "learning_rate": 6.264957264957265e-06, "loss": 1.0508, "step": 874 }, { "epoch": 7.478632478632479, "grad_norm": 5.840590000152588, "learning_rate": 6.260683760683761e-06, "loss": 0.2511, "step": 875 }, { "epoch": 7.487179487179487, "grad_norm": 10.25346851348877, "learning_rate": 6.256410256410257e-06, "loss": 0.5836, "step": 876 }, { "epoch": 7.495726495726496, "grad_norm": 27.662694931030273, "learning_rate": 6.252136752136753e-06, "loss": 0.7677, "step": 877 }, { "epoch": 7.504273504273504, "grad_norm": 5.840217590332031, "learning_rate": 6.247863247863249e-06, "loss": 0.3889, "step": 878 }, { "epoch": 7.512820512820513, "grad_norm": 9.813179016113281, "learning_rate": 6.243589743589744e-06, "loss": 0.8929, "step": 879 }, { "epoch": 7.521367521367521, "grad_norm": 5.49755334854126, "learning_rate": 6.23931623931624e-06, "loss": 0.2712, "step": 880 }, { "epoch": 7.52991452991453, "grad_norm": 7.17311429977417, "learning_rate": 6.2350427350427355e-06, "loss": 0.3071, "step": 881 }, { "epoch": 7.538461538461538, "grad_norm": 7.706870079040527, "learning_rate": 6.230769230769231e-06, "loss": 0.3797, "step": 882 }, { "epoch": 7.547008547008547, "grad_norm": 7.891415596008301, "learning_rate": 6.226495726495727e-06, "loss": 0.5352, "step": 883 }, { "epoch": 7.555555555555555, "grad_norm": 8.746044158935547, "learning_rate": 6.222222222222223e-06, "loss": 0.263, "step": 884 }, { "epoch": 7.564102564102564, "grad_norm": 9.096441268920898, "learning_rate": 6.217948717948718e-06, "loss": 0.2736, "step": 885 }, { "epoch": 7.572649572649572, "grad_norm": 7.031003475189209, "learning_rate": 6.213675213675214e-06, "loss": 0.4705, "step": 886 }, { "epoch": 7.581196581196581, "grad_norm": 6.6503143310546875, "learning_rate": 6.20940170940171e-06, "loss": 0.3285, "step": 887 }, { "epoch": 7.589743589743589, "grad_norm": 5.398913383483887, "learning_rate": 6.205128205128206e-06, "loss": 0.41, "step": 888 }, { "epoch": 7.598290598290598, "grad_norm": 7.47569465637207, "learning_rate": 6.200854700854702e-06, "loss": 0.4005, "step": 889 }, { "epoch": 7.6068376068376065, "grad_norm": 8.79906940460205, "learning_rate": 6.196581196581198e-06, "loss": 0.2608, "step": 890 }, { "epoch": 7.615384615384615, "grad_norm": 7.604002475738525, "learning_rate": 6.192307692307693e-06, "loss": 0.577, "step": 891 }, { "epoch": 7.6239316239316235, "grad_norm": 12.666848182678223, "learning_rate": 6.188034188034189e-06, "loss": 0.7296, "step": 892 }, { "epoch": 7.632478632478632, "grad_norm": 20.92390251159668, "learning_rate": 6.1837606837606845e-06, "loss": 0.9276, "step": 893 }, { "epoch": 7.641025641025641, "grad_norm": 6.779317855834961, "learning_rate": 6.17948717948718e-06, "loss": 0.818, "step": 894 }, { "epoch": 7.64957264957265, "grad_norm": 5.249539852142334, "learning_rate": 6.175213675213676e-06, "loss": 0.2117, "step": 895 }, { "epoch": 7.6581196581196584, "grad_norm": 23.55508041381836, "learning_rate": 6.170940170940171e-06, "loss": 0.5239, "step": 896 }, { "epoch": 7.666666666666667, "grad_norm": 11.711256980895996, "learning_rate": 6.166666666666667e-06, "loss": 0.6595, "step": 897 }, { "epoch": 7.6752136752136755, "grad_norm": 6.641115188598633, "learning_rate": 6.162393162393163e-06, "loss": 0.4888, "step": 898 }, { "epoch": 7.683760683760684, "grad_norm": 7.913390159606934, "learning_rate": 6.158119658119659e-06, "loss": 0.66, "step": 899 }, { "epoch": 7.6923076923076925, "grad_norm": 17.927574157714844, "learning_rate": 6.153846153846155e-06, "loss": 0.9603, "step": 900 }, { "epoch": 7.700854700854701, "grad_norm": 4.567203998565674, "learning_rate": 6.149572649572651e-06, "loss": 0.1638, "step": 901 }, { "epoch": 7.7094017094017095, "grad_norm": 5.995935440063477, "learning_rate": 6.145299145299146e-06, "loss": 0.6852, "step": 902 }, { "epoch": 7.717948717948718, "grad_norm": 8.323802947998047, "learning_rate": 6.141025641025642e-06, "loss": 0.5293, "step": 903 }, { "epoch": 7.726495726495727, "grad_norm": 6.8586859703063965, "learning_rate": 6.136752136752138e-06, "loss": 0.3265, "step": 904 }, { "epoch": 7.735042735042735, "grad_norm": 6.507427215576172, "learning_rate": 6.1324786324786335e-06, "loss": 0.2841, "step": 905 }, { "epoch": 7.743589743589744, "grad_norm": 6.789999485015869, "learning_rate": 6.128205128205129e-06, "loss": 0.4236, "step": 906 }, { "epoch": 7.752136752136752, "grad_norm": 19.444454193115234, "learning_rate": 6.123931623931625e-06, "loss": 0.2829, "step": 907 }, { "epoch": 7.760683760683761, "grad_norm": 31.564800262451172, "learning_rate": 6.11965811965812e-06, "loss": 1.093, "step": 908 }, { "epoch": 7.769230769230769, "grad_norm": 9.956007957458496, "learning_rate": 6.115384615384616e-06, "loss": 0.6749, "step": 909 }, { "epoch": 7.777777777777778, "grad_norm": 5.193087577819824, "learning_rate": 6.111111111111112e-06, "loss": 0.1986, "step": 910 }, { "epoch": 7.786324786324786, "grad_norm": 4.792945384979248, "learning_rate": 6.106837606837608e-06, "loss": 0.5179, "step": 911 }, { "epoch": 7.794871794871795, "grad_norm": 20.602317810058594, "learning_rate": 6.102564102564104e-06, "loss": 1.0343, "step": 912 }, { "epoch": 7.803418803418803, "grad_norm": 22.205543518066406, "learning_rate": 6.098290598290599e-06, "loss": 0.4921, "step": 913 }, { "epoch": 7.811965811965812, "grad_norm": 13.392712593078613, "learning_rate": 6.094017094017095e-06, "loss": 0.9058, "step": 914 }, { "epoch": 7.82051282051282, "grad_norm": 6.262679100036621, "learning_rate": 6.08974358974359e-06, "loss": 0.3877, "step": 915 }, { "epoch": 7.829059829059829, "grad_norm": 12.727428436279297, "learning_rate": 6.085470085470086e-06, "loss": 0.4477, "step": 916 }, { "epoch": 7.837606837606837, "grad_norm": 6.595224380493164, "learning_rate": 6.081196581196581e-06, "loss": 0.5553, "step": 917 }, { "epoch": 7.846153846153846, "grad_norm": 6.815043926239014, "learning_rate": 6.076923076923077e-06, "loss": 0.2978, "step": 918 }, { "epoch": 7.854700854700854, "grad_norm": 11.751949310302734, "learning_rate": 6.0726495726495726e-06, "loss": 0.5509, "step": 919 }, { "epoch": 7.863247863247864, "grad_norm": 6.067570209503174, "learning_rate": 6.0683760683760684e-06, "loss": 0.475, "step": 920 }, { "epoch": 7.871794871794872, "grad_norm": 7.4297919273376465, "learning_rate": 6.064102564102564e-06, "loss": 0.5073, "step": 921 }, { "epoch": 7.880341880341881, "grad_norm": 6.778268337249756, "learning_rate": 6.05982905982906e-06, "loss": 0.4718, "step": 922 }, { "epoch": 7.888888888888889, "grad_norm": 9.401915550231934, "learning_rate": 6.055555555555555e-06, "loss": 0.7151, "step": 923 }, { "epoch": 7.897435897435898, "grad_norm": 6.359888553619385, "learning_rate": 6.051282051282051e-06, "loss": 0.3175, "step": 924 }, { "epoch": 7.905982905982906, "grad_norm": 7.036016464233398, "learning_rate": 6.047008547008547e-06, "loss": 0.3172, "step": 925 }, { "epoch": 7.914529914529915, "grad_norm": 5.980124473571777, "learning_rate": 6.042735042735043e-06, "loss": 0.2949, "step": 926 }, { "epoch": 7.923076923076923, "grad_norm": 5.738795280456543, "learning_rate": 6.038461538461539e-06, "loss": 0.2454, "step": 927 }, { "epoch": 7.931623931623932, "grad_norm": 4.688748359680176, "learning_rate": 6.034188034188035e-06, "loss": 0.1949, "step": 928 }, { "epoch": 7.94017094017094, "grad_norm": 7.2333984375, "learning_rate": 6.02991452991453e-06, "loss": 0.2174, "step": 929 }, { "epoch": 7.948717948717949, "grad_norm": 6.005523204803467, "learning_rate": 6.025641025641026e-06, "loss": 0.4216, "step": 930 }, { "epoch": 7.957264957264957, "grad_norm": 6.017541885375977, "learning_rate": 6.0213675213675215e-06, "loss": 0.4904, "step": 931 }, { "epoch": 7.965811965811966, "grad_norm": 19.559003829956055, "learning_rate": 6.0170940170940174e-06, "loss": 0.2616, "step": 932 }, { "epoch": 7.9743589743589745, "grad_norm": 5.360724449157715, "learning_rate": 6.012820512820513e-06, "loss": 0.3629, "step": 933 }, { "epoch": 7.982905982905983, "grad_norm": 9.472721099853516, "learning_rate": 6.008547008547008e-06, "loss": 0.5044, "step": 934 }, { "epoch": 7.9914529914529915, "grad_norm": 6.453597068786621, "learning_rate": 6.004273504273504e-06, "loss": 0.4742, "step": 935 }, { "epoch": 8.0, "grad_norm": 7.647386074066162, "learning_rate": 6e-06, "loss": 0.402, "step": 936 }, { "epoch": 8.0, "eval_loss": 0.1672903448343277, "eval_runtime": 9.3047, "eval_samples_per_second": 50.082, "eval_steps_per_second": 6.341, "step": 936 }, { "epoch": 8.008547008547009, "grad_norm": 5.8361663818359375, "learning_rate": 5.995726495726496e-06, "loss": 0.164, "step": 937 }, { "epoch": 8.017094017094017, "grad_norm": 5.801360130310059, "learning_rate": 5.991452991452992e-06, "loss": 0.2858, "step": 938 }, { "epoch": 8.025641025641026, "grad_norm": 4.43051290512085, "learning_rate": 5.987179487179488e-06, "loss": 0.2068, "step": 939 }, { "epoch": 8.034188034188034, "grad_norm": 6.544061660766602, "learning_rate": 5.982905982905983e-06, "loss": 0.3499, "step": 940 }, { "epoch": 8.042735042735043, "grad_norm": 5.500844955444336, "learning_rate": 5.978632478632479e-06, "loss": 0.3134, "step": 941 }, { "epoch": 8.051282051282051, "grad_norm": 4.286651611328125, "learning_rate": 5.974358974358975e-06, "loss": 0.1767, "step": 942 }, { "epoch": 8.05982905982906, "grad_norm": 13.860437393188477, "learning_rate": 5.9700854700854705e-06, "loss": 0.3913, "step": 943 }, { "epoch": 8.068376068376068, "grad_norm": 5.998767852783203, "learning_rate": 5.9658119658119664e-06, "loss": 0.2275, "step": 944 }, { "epoch": 8.076923076923077, "grad_norm": 9.01196002960205, "learning_rate": 5.961538461538462e-06, "loss": 0.5202, "step": 945 }, { "epoch": 8.085470085470085, "grad_norm": 6.81577730178833, "learning_rate": 5.957264957264957e-06, "loss": 0.5923, "step": 946 }, { "epoch": 8.094017094017094, "grad_norm": 7.400684833526611, "learning_rate": 5.952991452991453e-06, "loss": 0.2883, "step": 947 }, { "epoch": 8.102564102564102, "grad_norm": 16.18587875366211, "learning_rate": 5.948717948717949e-06, "loss": 0.3377, "step": 948 }, { "epoch": 8.11111111111111, "grad_norm": 5.017345428466797, "learning_rate": 5.944444444444445e-06, "loss": 0.3912, "step": 949 }, { "epoch": 8.11965811965812, "grad_norm": 5.300196647644043, "learning_rate": 5.940170940170941e-06, "loss": 0.4056, "step": 950 }, { "epoch": 8.128205128205128, "grad_norm": 6.3473405838012695, "learning_rate": 5.935897435897436e-06, "loss": 0.2559, "step": 951 }, { "epoch": 8.136752136752136, "grad_norm": 12.37689208984375, "learning_rate": 5.931623931623932e-06, "loss": 0.2216, "step": 952 }, { "epoch": 8.145299145299145, "grad_norm": 5.573046684265137, "learning_rate": 5.927350427350428e-06, "loss": 0.2047, "step": 953 }, { "epoch": 8.153846153846153, "grad_norm": 5.033559322357178, "learning_rate": 5.923076923076924e-06, "loss": 0.3661, "step": 954 }, { "epoch": 8.162393162393162, "grad_norm": 5.341614246368408, "learning_rate": 5.9188034188034195e-06, "loss": 0.2597, "step": 955 }, { "epoch": 8.17094017094017, "grad_norm": 8.67937183380127, "learning_rate": 5.914529914529915e-06, "loss": 0.4098, "step": 956 }, { "epoch": 8.179487179487179, "grad_norm": 3.957489252090454, "learning_rate": 5.9102564102564105e-06, "loss": 0.18, "step": 957 }, { "epoch": 8.188034188034187, "grad_norm": 6.377108573913574, "learning_rate": 5.905982905982906e-06, "loss": 0.3414, "step": 958 }, { "epoch": 8.196581196581196, "grad_norm": 8.621227264404297, "learning_rate": 5.901709401709402e-06, "loss": 1.1625, "step": 959 }, { "epoch": 8.205128205128204, "grad_norm": 5.775392532348633, "learning_rate": 5.897435897435898e-06, "loss": 0.4283, "step": 960 }, { "epoch": 8.213675213675213, "grad_norm": 4.522337913513184, "learning_rate": 5.893162393162394e-06, "loss": 0.3432, "step": 961 }, { "epoch": 8.222222222222221, "grad_norm": 5.594667434692383, "learning_rate": 5.88888888888889e-06, "loss": 0.5212, "step": 962 }, { "epoch": 8.23076923076923, "grad_norm": 5.478531837463379, "learning_rate": 5.884615384615385e-06, "loss": 0.2273, "step": 963 }, { "epoch": 8.239316239316238, "grad_norm": 6.08770751953125, "learning_rate": 5.880341880341881e-06, "loss": 0.2673, "step": 964 }, { "epoch": 8.247863247863247, "grad_norm": 7.962898254394531, "learning_rate": 5.876068376068377e-06, "loss": 0.2654, "step": 965 }, { "epoch": 8.256410256410255, "grad_norm": 6.443154335021973, "learning_rate": 5.871794871794873e-06, "loss": 0.2982, "step": 966 }, { "epoch": 8.264957264957266, "grad_norm": 4.689123153686523, "learning_rate": 5.8675213675213685e-06, "loss": 0.3459, "step": 967 }, { "epoch": 8.273504273504274, "grad_norm": 5.446859359741211, "learning_rate": 5.863247863247864e-06, "loss": 0.2792, "step": 968 }, { "epoch": 8.282051282051283, "grad_norm": 5.562478542327881, "learning_rate": 5.8589743589743595e-06, "loss": 0.1939, "step": 969 }, { "epoch": 8.290598290598291, "grad_norm": 4.726650714874268, "learning_rate": 5.854700854700855e-06, "loss": 0.1368, "step": 970 }, { "epoch": 8.2991452991453, "grad_norm": 17.44293785095215, "learning_rate": 5.850427350427351e-06, "loss": 0.3836, "step": 971 }, { "epoch": 8.307692307692308, "grad_norm": 5.568243980407715, "learning_rate": 5.846153846153847e-06, "loss": 0.3674, "step": 972 }, { "epoch": 8.316239316239317, "grad_norm": 3.488147258758545, "learning_rate": 5.841880341880343e-06, "loss": 0.197, "step": 973 }, { "epoch": 8.324786324786325, "grad_norm": 15.902129173278809, "learning_rate": 5.837606837606838e-06, "loss": 0.4199, "step": 974 }, { "epoch": 8.333333333333334, "grad_norm": 8.055335998535156, "learning_rate": 5.833333333333334e-06, "loss": 0.277, "step": 975 }, { "epoch": 8.341880341880342, "grad_norm": 8.122756004333496, "learning_rate": 5.82905982905983e-06, "loss": 0.5572, "step": 976 }, { "epoch": 8.350427350427351, "grad_norm": 5.7439961433410645, "learning_rate": 5.824786324786326e-06, "loss": 0.2031, "step": 977 }, { "epoch": 8.35897435897436, "grad_norm": 4.329511642456055, "learning_rate": 5.820512820512822e-06, "loss": 0.4405, "step": 978 }, { "epoch": 8.367521367521368, "grad_norm": 10.946788787841797, "learning_rate": 5.8162393162393175e-06, "loss": 0.4619, "step": 979 }, { "epoch": 8.376068376068377, "grad_norm": 6.0579352378845215, "learning_rate": 5.8119658119658126e-06, "loss": 0.4679, "step": 980 }, { "epoch": 8.384615384615385, "grad_norm": 5.656944751739502, "learning_rate": 5.8076923076923084e-06, "loss": 0.2395, "step": 981 }, { "epoch": 8.393162393162394, "grad_norm": 5.344303607940674, "learning_rate": 5.803418803418804e-06, "loss": 0.2516, "step": 982 }, { "epoch": 8.401709401709402, "grad_norm": 7.070309638977051, "learning_rate": 5.7991452991453e-06, "loss": 0.3169, "step": 983 }, { "epoch": 8.41025641025641, "grad_norm": 5.168705940246582, "learning_rate": 5.794871794871796e-06, "loss": 0.3007, "step": 984 }, { "epoch": 8.418803418803419, "grad_norm": 3.556293249130249, "learning_rate": 5.790598290598292e-06, "loss": 0.2089, "step": 985 }, { "epoch": 8.427350427350428, "grad_norm": 4.943065166473389, "learning_rate": 5.786324786324787e-06, "loss": 0.2093, "step": 986 }, { "epoch": 8.435897435897436, "grad_norm": 6.991105556488037, "learning_rate": 5.782051282051283e-06, "loss": 0.4671, "step": 987 }, { "epoch": 8.444444444444445, "grad_norm": 5.276190280914307, "learning_rate": 5.777777777777778e-06, "loss": 0.2092, "step": 988 }, { "epoch": 8.452991452991453, "grad_norm": 77.91864776611328, "learning_rate": 5.773504273504273e-06, "loss": 1.7536, "step": 989 }, { "epoch": 8.461538461538462, "grad_norm": 4.864828109741211, "learning_rate": 5.769230769230769e-06, "loss": 0.1669, "step": 990 }, { "epoch": 8.47008547008547, "grad_norm": 4.416967391967773, "learning_rate": 5.764957264957265e-06, "loss": 0.2705, "step": 991 }, { "epoch": 8.478632478632479, "grad_norm": 4.558652400970459, "learning_rate": 5.760683760683761e-06, "loss": 0.4332, "step": 992 }, { "epoch": 8.487179487179487, "grad_norm": 8.17482852935791, "learning_rate": 5.756410256410257e-06, "loss": 0.7286, "step": 993 }, { "epoch": 8.495726495726496, "grad_norm": 7.322425365447998, "learning_rate": 5.7521367521367525e-06, "loss": 0.8554, "step": 994 }, { "epoch": 8.504273504273504, "grad_norm": 4.249075889587402, "learning_rate": 5.7478632478632475e-06, "loss": 0.2442, "step": 995 }, { "epoch": 8.512820512820513, "grad_norm": 4.157267093658447, "learning_rate": 5.743589743589743e-06, "loss": 0.4207, "step": 996 }, { "epoch": 8.521367521367521, "grad_norm": 4.118504047393799, "learning_rate": 5.739316239316239e-06, "loss": 0.1411, "step": 997 }, { "epoch": 8.52991452991453, "grad_norm": 7.273322105407715, "learning_rate": 5.735042735042735e-06, "loss": 0.6269, "step": 998 }, { "epoch": 8.538461538461538, "grad_norm": 4.7668633460998535, "learning_rate": 5.730769230769231e-06, "loss": 0.1894, "step": 999 }, { "epoch": 8.547008547008547, "grad_norm": 5.869007110595703, "learning_rate": 5.726495726495727e-06, "loss": 0.7301, "step": 1000 }, { "epoch": 8.555555555555555, "grad_norm": 5.987617015838623, "learning_rate": 5.722222222222222e-06, "loss": 0.29, "step": 1001 }, { "epoch": 8.564102564102564, "grad_norm": 5.445812702178955, "learning_rate": 5.717948717948718e-06, "loss": 0.4278, "step": 1002 }, { "epoch": 8.572649572649572, "grad_norm": 4.7509002685546875, "learning_rate": 5.713675213675214e-06, "loss": 0.3396, "step": 1003 }, { "epoch": 8.581196581196581, "grad_norm": 5.584397315979004, "learning_rate": 5.70940170940171e-06, "loss": 0.1329, "step": 1004 }, { "epoch": 8.58974358974359, "grad_norm": 4.627229690551758, "learning_rate": 5.705128205128206e-06, "loss": 0.3012, "step": 1005 }, { "epoch": 8.598290598290598, "grad_norm": 7.724045276641846, "learning_rate": 5.7008547008547015e-06, "loss": 0.4876, "step": 1006 }, { "epoch": 8.606837606837606, "grad_norm": 3.488499164581299, "learning_rate": 5.6965811965811965e-06, "loss": 0.2025, "step": 1007 }, { "epoch": 8.615384615384615, "grad_norm": 14.487537384033203, "learning_rate": 5.692307692307692e-06, "loss": 0.6795, "step": 1008 }, { "epoch": 8.623931623931623, "grad_norm": 4.03059196472168, "learning_rate": 5.688034188034188e-06, "loss": 0.2121, "step": 1009 }, { "epoch": 8.632478632478632, "grad_norm": 3.278873920440674, "learning_rate": 5.683760683760684e-06, "loss": 0.3475, "step": 1010 }, { "epoch": 8.64102564102564, "grad_norm": 4.599937915802002, "learning_rate": 5.67948717948718e-06, "loss": 0.2355, "step": 1011 }, { "epoch": 8.649572649572649, "grad_norm": 6.314788818359375, "learning_rate": 5.675213675213675e-06, "loss": 0.2402, "step": 1012 }, { "epoch": 8.658119658119658, "grad_norm": 3.4483532905578613, "learning_rate": 5.670940170940171e-06, "loss": 0.2189, "step": 1013 }, { "epoch": 8.666666666666666, "grad_norm": 299.8923645019531, "learning_rate": 5.666666666666667e-06, "loss": 1.0473, "step": 1014 }, { "epoch": 8.675213675213675, "grad_norm": 13.14855670928955, "learning_rate": 5.662393162393163e-06, "loss": 0.3723, "step": 1015 }, { "epoch": 8.683760683760683, "grad_norm": 6.513180732727051, "learning_rate": 5.658119658119659e-06, "loss": 0.483, "step": 1016 }, { "epoch": 8.692307692307692, "grad_norm": 5.026037693023682, "learning_rate": 5.6538461538461546e-06, "loss": 0.4417, "step": 1017 }, { "epoch": 8.7008547008547, "grad_norm": 176.535888671875, "learning_rate": 5.64957264957265e-06, "loss": 0.5256, "step": 1018 }, { "epoch": 8.709401709401709, "grad_norm": 6.023639678955078, "learning_rate": 5.6452991452991455e-06, "loss": 0.3708, "step": 1019 }, { "epoch": 8.717948717948717, "grad_norm": 16.64018440246582, "learning_rate": 5.641025641025641e-06, "loss": 0.8908, "step": 1020 }, { "epoch": 8.726495726495726, "grad_norm": 2.9167582988739014, "learning_rate": 5.636752136752137e-06, "loss": 0.077, "step": 1021 }, { "epoch": 8.735042735042736, "grad_norm": 3.368325710296631, "learning_rate": 5.632478632478633e-06, "loss": 0.2495, "step": 1022 }, { "epoch": 8.743589743589745, "grad_norm": 3.7961905002593994, "learning_rate": 5.628205128205129e-06, "loss": 0.4427, "step": 1023 }, { "epoch": 8.752136752136753, "grad_norm": 4.661024570465088, "learning_rate": 5.623931623931624e-06, "loss": 0.3092, "step": 1024 }, { "epoch": 8.760683760683762, "grad_norm": 5.1971588134765625, "learning_rate": 5.61965811965812e-06, "loss": 0.2213, "step": 1025 }, { "epoch": 8.76923076923077, "grad_norm": 4.427041530609131, "learning_rate": 5.615384615384616e-06, "loss": 0.2885, "step": 1026 }, { "epoch": 8.777777777777779, "grad_norm": 7.352906703948975, "learning_rate": 5.611111111111112e-06, "loss": 0.2689, "step": 1027 }, { "epoch": 8.786324786324787, "grad_norm": 5.306934833526611, "learning_rate": 5.606837606837608e-06, "loss": 0.3758, "step": 1028 }, { "epoch": 8.794871794871796, "grad_norm": 4.502418041229248, "learning_rate": 5.602564102564103e-06, "loss": 0.4655, "step": 1029 }, { "epoch": 8.803418803418804, "grad_norm": 3.427734851837158, "learning_rate": 5.598290598290599e-06, "loss": 0.1145, "step": 1030 }, { "epoch": 8.811965811965813, "grad_norm": 4.047433376312256, "learning_rate": 5.5940170940170945e-06, "loss": 0.1482, "step": 1031 }, { "epoch": 8.820512820512821, "grad_norm": 3.6860435009002686, "learning_rate": 5.58974358974359e-06, "loss": 0.1152, "step": 1032 }, { "epoch": 8.82905982905983, "grad_norm": 6.792733669281006, "learning_rate": 5.585470085470086e-06, "loss": 0.1732, "step": 1033 }, { "epoch": 8.837606837606838, "grad_norm": 4.222206115722656, "learning_rate": 5.581196581196582e-06, "loss": 0.1259, "step": 1034 }, { "epoch": 8.846153846153847, "grad_norm": 4.376220703125, "learning_rate": 5.576923076923077e-06, "loss": 0.2403, "step": 1035 }, { "epoch": 8.854700854700855, "grad_norm": 3.459076166152954, "learning_rate": 5.572649572649573e-06, "loss": 0.2064, "step": 1036 }, { "epoch": 8.863247863247864, "grad_norm": 6.312697410583496, "learning_rate": 5.568376068376069e-06, "loss": 0.5076, "step": 1037 }, { "epoch": 8.871794871794872, "grad_norm": 10.137848854064941, "learning_rate": 5.564102564102565e-06, "loss": 0.1649, "step": 1038 }, { "epoch": 8.88034188034188, "grad_norm": 6.605007171630859, "learning_rate": 5.559829059829061e-06, "loss": 0.4233, "step": 1039 }, { "epoch": 8.88888888888889, "grad_norm": 3.9786465167999268, "learning_rate": 5.555555555555557e-06, "loss": 0.1801, "step": 1040 }, { "epoch": 8.897435897435898, "grad_norm": 4.40491247177124, "learning_rate": 5.551282051282052e-06, "loss": 0.169, "step": 1041 }, { "epoch": 8.905982905982906, "grad_norm": 4.719818592071533, "learning_rate": 5.547008547008548e-06, "loss": 0.1454, "step": 1042 }, { "epoch": 8.914529914529915, "grad_norm": 2.384941577911377, "learning_rate": 5.5427350427350435e-06, "loss": 0.0723, "step": 1043 }, { "epoch": 8.923076923076923, "grad_norm": 3.258315324783325, "learning_rate": 5.538461538461539e-06, "loss": 0.1023, "step": 1044 }, { "epoch": 8.931623931623932, "grad_norm": 18.745052337646484, "learning_rate": 5.534188034188035e-06, "loss": 0.2673, "step": 1045 }, { "epoch": 8.94017094017094, "grad_norm": 3.788177967071533, "learning_rate": 5.52991452991453e-06, "loss": 0.3173, "step": 1046 }, { "epoch": 8.948717948717949, "grad_norm": 2.734895944595337, "learning_rate": 5.525641025641026e-06, "loss": 0.0834, "step": 1047 }, { "epoch": 8.957264957264957, "grad_norm": 4.158284664154053, "learning_rate": 5.521367521367522e-06, "loss": 0.3414, "step": 1048 }, { "epoch": 8.965811965811966, "grad_norm": 4.875148296356201, "learning_rate": 5.517094017094018e-06, "loss": 0.2729, "step": 1049 }, { "epoch": 8.974358974358974, "grad_norm": 5.2556352615356445, "learning_rate": 5.512820512820514e-06, "loss": 0.1422, "step": 1050 }, { "epoch": 8.982905982905983, "grad_norm": 3.817049980163574, "learning_rate": 5.50854700854701e-06, "loss": 0.2514, "step": 1051 }, { "epoch": 8.991452991452991, "grad_norm": 2.247227668762207, "learning_rate": 5.504273504273505e-06, "loss": 0.0703, "step": 1052 }, { "epoch": 9.0, "grad_norm": 34.36362838745117, "learning_rate": 5.500000000000001e-06, "loss": 0.7433, "step": 1053 }, { "epoch": 9.0, "eval_loss": 0.12675683200359344, "eval_runtime": 9.3141, "eval_samples_per_second": 50.032, "eval_steps_per_second": 6.334, "step": 1053 }, { "epoch": 9.008547008547009, "grad_norm": 5.314228057861328, "learning_rate": 5.495726495726497e-06, "loss": 0.2576, "step": 1054 }, { "epoch": 9.017094017094017, "grad_norm": 34.33782958984375, "learning_rate": 5.4914529914529925e-06, "loss": 0.3833, "step": 1055 }, { "epoch": 9.025641025641026, "grad_norm": 5.440598964691162, "learning_rate": 5.487179487179488e-06, "loss": 0.3898, "step": 1056 }, { "epoch": 9.034188034188034, "grad_norm": 3.561518907546997, "learning_rate": 5.482905982905984e-06, "loss": 0.2197, "step": 1057 }, { "epoch": 9.042735042735043, "grad_norm": 4.7679762840271, "learning_rate": 5.478632478632479e-06, "loss": 0.3885, "step": 1058 }, { "epoch": 9.051282051282051, "grad_norm": 4.694134712219238, "learning_rate": 5.474358974358975e-06, "loss": 0.2532, "step": 1059 }, { "epoch": 9.05982905982906, "grad_norm": 4.347025394439697, "learning_rate": 5.470085470085471e-06, "loss": 0.1949, "step": 1060 }, { "epoch": 9.068376068376068, "grad_norm": 4.064525127410889, "learning_rate": 5.465811965811966e-06, "loss": 0.1597, "step": 1061 }, { "epoch": 9.076923076923077, "grad_norm": 3.78560471534729, "learning_rate": 5.461538461538461e-06, "loss": 0.18, "step": 1062 }, { "epoch": 9.085470085470085, "grad_norm": 7.843743324279785, "learning_rate": 5.457264957264957e-06, "loss": 0.3146, "step": 1063 }, { "epoch": 9.094017094017094, "grad_norm": 8.152037620544434, "learning_rate": 5.452991452991453e-06, "loss": 0.3384, "step": 1064 }, { "epoch": 9.102564102564102, "grad_norm": 3.987872838973999, "learning_rate": 5.448717948717949e-06, "loss": 0.2071, "step": 1065 }, { "epoch": 9.11111111111111, "grad_norm": 3.478532552719116, "learning_rate": 5.444444444444445e-06, "loss": 0.1788, "step": 1066 }, { "epoch": 9.11965811965812, "grad_norm": 3.6598286628723145, "learning_rate": 5.44017094017094e-06, "loss": 0.2459, "step": 1067 }, { "epoch": 9.128205128205128, "grad_norm": 9.528829574584961, "learning_rate": 5.435897435897436e-06, "loss": 0.2046, "step": 1068 }, { "epoch": 9.136752136752136, "grad_norm": 3.3274407386779785, "learning_rate": 5.4316239316239316e-06, "loss": 0.1414, "step": 1069 }, { "epoch": 9.145299145299145, "grad_norm": 5.117324352264404, "learning_rate": 5.4273504273504275e-06, "loss": 0.3636, "step": 1070 }, { "epoch": 9.153846153846153, "grad_norm": 8.604976654052734, "learning_rate": 5.423076923076923e-06, "loss": 0.2723, "step": 1071 }, { "epoch": 9.162393162393162, "grad_norm": 72.67993927001953, "learning_rate": 5.418803418803419e-06, "loss": 0.5863, "step": 1072 }, { "epoch": 9.17094017094017, "grad_norm": 3.8609094619750977, "learning_rate": 5.414529914529914e-06, "loss": 0.1778, "step": 1073 }, { "epoch": 9.179487179487179, "grad_norm": 21.24209976196289, "learning_rate": 5.41025641025641e-06, "loss": 0.2062, "step": 1074 }, { "epoch": 9.188034188034187, "grad_norm": 5.552285194396973, "learning_rate": 5.405982905982906e-06, "loss": 0.4685, "step": 1075 }, { "epoch": 9.196581196581196, "grad_norm": 12.241254806518555, "learning_rate": 5.401709401709402e-06, "loss": 0.4309, "step": 1076 }, { "epoch": 9.205128205128204, "grad_norm": 3.6276049613952637, "learning_rate": 5.397435897435898e-06, "loss": 0.0924, "step": 1077 }, { "epoch": 9.213675213675213, "grad_norm": 10.98838996887207, "learning_rate": 5.393162393162394e-06, "loss": 0.7616, "step": 1078 }, { "epoch": 9.222222222222221, "grad_norm": 4.689146041870117, "learning_rate": 5.388888888888889e-06, "loss": 0.346, "step": 1079 }, { "epoch": 9.23076923076923, "grad_norm": 6.385439872741699, "learning_rate": 5.384615384615385e-06, "loss": 0.2945, "step": 1080 }, { "epoch": 9.239316239316238, "grad_norm": 2.4931023120880127, "learning_rate": 5.3803418803418806e-06, "loss": 0.172, "step": 1081 }, { "epoch": 9.247863247863247, "grad_norm": 3.797539472579956, "learning_rate": 5.3760683760683764e-06, "loss": 0.0927, "step": 1082 }, { "epoch": 9.256410256410255, "grad_norm": 2.7136716842651367, "learning_rate": 5.371794871794872e-06, "loss": 0.0932, "step": 1083 }, { "epoch": 9.264957264957266, "grad_norm": 5.207858085632324, "learning_rate": 5.367521367521367e-06, "loss": 0.1176, "step": 1084 }, { "epoch": 9.273504273504274, "grad_norm": 3.95009183883667, "learning_rate": 5.363247863247863e-06, "loss": 0.3045, "step": 1085 }, { "epoch": 9.282051282051283, "grad_norm": 1.9097685813903809, "learning_rate": 5.358974358974359e-06, "loss": 0.1793, "step": 1086 }, { "epoch": 9.290598290598291, "grad_norm": 3.205216407775879, "learning_rate": 5.354700854700855e-06, "loss": 0.1071, "step": 1087 }, { "epoch": 9.2991452991453, "grad_norm": 3.481822967529297, "learning_rate": 5.350427350427351e-06, "loss": 0.3885, "step": 1088 }, { "epoch": 9.307692307692308, "grad_norm": 11.802562713623047, "learning_rate": 5.346153846153847e-06, "loss": 0.1769, "step": 1089 }, { "epoch": 9.316239316239317, "grad_norm": 3.101505994796753, "learning_rate": 5.341880341880342e-06, "loss": 0.1265, "step": 1090 }, { "epoch": 9.324786324786325, "grad_norm": 5.163032054901123, "learning_rate": 5.337606837606838e-06, "loss": 0.4768, "step": 1091 }, { "epoch": 9.333333333333334, "grad_norm": 1.8217605352401733, "learning_rate": 5.333333333333334e-06, "loss": 0.053, "step": 1092 }, { "epoch": 9.341880341880342, "grad_norm": 2.6139562129974365, "learning_rate": 5.3290598290598295e-06, "loss": 0.0848, "step": 1093 }, { "epoch": 9.350427350427351, "grad_norm": 3.1172311305999756, "learning_rate": 5.3247863247863254e-06, "loss": 0.1076, "step": 1094 }, { "epoch": 9.35897435897436, "grad_norm": 5.907342433929443, "learning_rate": 5.320512820512821e-06, "loss": 0.1737, "step": 1095 }, { "epoch": 9.367521367521368, "grad_norm": 45.74967575073242, "learning_rate": 5.316239316239316e-06, "loss": 0.2455, "step": 1096 }, { "epoch": 9.376068376068377, "grad_norm": 3.1865549087524414, "learning_rate": 5.311965811965812e-06, "loss": 0.2236, "step": 1097 }, { "epoch": 9.384615384615385, "grad_norm": 4.028379917144775, "learning_rate": 5.307692307692308e-06, "loss": 0.1065, "step": 1098 }, { "epoch": 9.393162393162394, "grad_norm": 5.388605117797852, "learning_rate": 5.303418803418804e-06, "loss": 0.2967, "step": 1099 }, { "epoch": 9.401709401709402, "grad_norm": 3.661736249923706, "learning_rate": 5.2991452991453e-06, "loss": 0.1271, "step": 1100 }, { "epoch": 9.41025641025641, "grad_norm": 4.693649768829346, "learning_rate": 5.294871794871795e-06, "loss": 0.7891, "step": 1101 }, { "epoch": 9.418803418803419, "grad_norm": 14.75247573852539, "learning_rate": 5.290598290598291e-06, "loss": 0.707, "step": 1102 }, { "epoch": 9.427350427350428, "grad_norm": 5.123616695404053, "learning_rate": 5.286324786324787e-06, "loss": 0.2424, "step": 1103 }, { "epoch": 9.435897435897436, "grad_norm": 5.946259021759033, "learning_rate": 5.282051282051283e-06, "loss": 0.2558, "step": 1104 }, { "epoch": 9.444444444444445, "grad_norm": 3.3757872581481934, "learning_rate": 5.2777777777777785e-06, "loss": 0.072, "step": 1105 }, { "epoch": 9.452991452991453, "grad_norm": 4.639676094055176, "learning_rate": 5.2735042735042744e-06, "loss": 0.1483, "step": 1106 }, { "epoch": 9.461538461538462, "grad_norm": 5.552156925201416, "learning_rate": 5.2692307692307695e-06, "loss": 0.341, "step": 1107 }, { "epoch": 9.47008547008547, "grad_norm": 10.601661682128906, "learning_rate": 5.264957264957265e-06, "loss": 0.5964, "step": 1108 }, { "epoch": 9.478632478632479, "grad_norm": 4.391530513763428, "learning_rate": 5.260683760683761e-06, "loss": 0.2346, "step": 1109 }, { "epoch": 9.487179487179487, "grad_norm": 3.150240659713745, "learning_rate": 5.256410256410257e-06, "loss": 0.1, "step": 1110 }, { "epoch": 9.495726495726496, "grad_norm": 5.60894775390625, "learning_rate": 5.252136752136753e-06, "loss": 0.397, "step": 1111 }, { "epoch": 9.504273504273504, "grad_norm": 9.21768856048584, "learning_rate": 5.247863247863249e-06, "loss": 0.2292, "step": 1112 }, { "epoch": 9.512820512820513, "grad_norm": 8.351348876953125, "learning_rate": 5.243589743589744e-06, "loss": 0.3129, "step": 1113 }, { "epoch": 9.521367521367521, "grad_norm": 3.0813419818878174, "learning_rate": 5.23931623931624e-06, "loss": 0.2539, "step": 1114 }, { "epoch": 9.52991452991453, "grad_norm": 5.553039073944092, "learning_rate": 5.235042735042736e-06, "loss": 0.1121, "step": 1115 }, { "epoch": 9.538461538461538, "grad_norm": 3.973057746887207, "learning_rate": 5.230769230769232e-06, "loss": 0.4928, "step": 1116 }, { "epoch": 9.547008547008547, "grad_norm": 4.753414630889893, "learning_rate": 5.2264957264957275e-06, "loss": 0.2247, "step": 1117 }, { "epoch": 9.555555555555555, "grad_norm": 7.344094753265381, "learning_rate": 5.2222222222222226e-06, "loss": 0.1405, "step": 1118 }, { "epoch": 9.564102564102564, "grad_norm": 47.83219528198242, "learning_rate": 5.2179487179487185e-06, "loss": 0.3108, "step": 1119 }, { "epoch": 9.572649572649572, "grad_norm": 2.31591796875, "learning_rate": 5.213675213675214e-06, "loss": 0.1019, "step": 1120 }, { "epoch": 9.581196581196581, "grad_norm": 3.871413230895996, "learning_rate": 5.20940170940171e-06, "loss": 0.2562, "step": 1121 }, { "epoch": 9.58974358974359, "grad_norm": 2.1789255142211914, "learning_rate": 5.205128205128206e-06, "loss": 0.0571, "step": 1122 }, { "epoch": 9.598290598290598, "grad_norm": 4.119174957275391, "learning_rate": 5.200854700854702e-06, "loss": 0.2799, "step": 1123 }, { "epoch": 9.606837606837606, "grad_norm": 7.873704433441162, "learning_rate": 5.196581196581197e-06, "loss": 0.2154, "step": 1124 }, { "epoch": 9.615384615384615, "grad_norm": 3.386780023574829, "learning_rate": 5.192307692307693e-06, "loss": 0.1607, "step": 1125 }, { "epoch": 9.623931623931623, "grad_norm": 3.3607964515686035, "learning_rate": 5.188034188034189e-06, "loss": 0.22, "step": 1126 }, { "epoch": 9.632478632478632, "grad_norm": 10.655082702636719, "learning_rate": 5.183760683760685e-06, "loss": 0.2102, "step": 1127 }, { "epoch": 9.64102564102564, "grad_norm": 5.550488471984863, "learning_rate": 5.179487179487181e-06, "loss": 0.347, "step": 1128 }, { "epoch": 9.649572649572649, "grad_norm": 4.184569835662842, "learning_rate": 5.1752136752136765e-06, "loss": 0.183, "step": 1129 }, { "epoch": 9.658119658119658, "grad_norm": 4.892969131469727, "learning_rate": 5.1709401709401716e-06, "loss": 0.2896, "step": 1130 }, { "epoch": 9.666666666666666, "grad_norm": 5.926670074462891, "learning_rate": 5.1666666666666675e-06, "loss": 0.3321, "step": 1131 }, { "epoch": 9.675213675213675, "grad_norm": 11.719461441040039, "learning_rate": 5.162393162393163e-06, "loss": 0.4055, "step": 1132 }, { "epoch": 9.683760683760683, "grad_norm": 3.5666840076446533, "learning_rate": 5.158119658119659e-06, "loss": 0.2318, "step": 1133 }, { "epoch": 9.692307692307692, "grad_norm": 6.800848484039307, "learning_rate": 5.1538461538461534e-06, "loss": 0.1202, "step": 1134 }, { "epoch": 9.7008547008547, "grad_norm": 4.50139856338501, "learning_rate": 5.149572649572649e-06, "loss": 0.1914, "step": 1135 }, { "epoch": 9.709401709401709, "grad_norm": 2.599607467651367, "learning_rate": 5.145299145299145e-06, "loss": 0.0833, "step": 1136 }, { "epoch": 9.717948717948717, "grad_norm": 6.084483623504639, "learning_rate": 5.141025641025641e-06, "loss": 0.0907, "step": 1137 }, { "epoch": 9.726495726495726, "grad_norm": 4.542915344238281, "learning_rate": 5.136752136752137e-06, "loss": 0.4554, "step": 1138 }, { "epoch": 9.735042735042736, "grad_norm": 3.871166229248047, "learning_rate": 5.132478632478632e-06, "loss": 0.3037, "step": 1139 }, { "epoch": 9.743589743589745, "grad_norm": 5.121057033538818, "learning_rate": 5.128205128205128e-06, "loss": 0.1751, "step": 1140 }, { "epoch": 9.752136752136753, "grad_norm": 3.7517125606536865, "learning_rate": 5.123931623931624e-06, "loss": 0.3144, "step": 1141 }, { "epoch": 9.760683760683762, "grad_norm": 1.7604278326034546, "learning_rate": 5.11965811965812e-06, "loss": 0.0649, "step": 1142 }, { "epoch": 9.76923076923077, "grad_norm": 13.68947982788086, "learning_rate": 5.115384615384616e-06, "loss": 0.2184, "step": 1143 }, { "epoch": 9.777777777777779, "grad_norm": 5.716836452484131, "learning_rate": 5.1111111111111115e-06, "loss": 0.1876, "step": 1144 }, { "epoch": 9.786324786324787, "grad_norm": 8.21943187713623, "learning_rate": 5.1068376068376065e-06, "loss": 0.349, "step": 1145 }, { "epoch": 9.794871794871796, "grad_norm": 5.270402908325195, "learning_rate": 5.1025641025641024e-06, "loss": 0.4442, "step": 1146 }, { "epoch": 9.803418803418804, "grad_norm": 2.3825948238372803, "learning_rate": 5.098290598290598e-06, "loss": 0.2237, "step": 1147 }, { "epoch": 9.811965811965813, "grad_norm": 11.812047958374023, "learning_rate": 5.094017094017094e-06, "loss": 0.5122, "step": 1148 }, { "epoch": 9.820512820512821, "grad_norm": 9.14202880859375, "learning_rate": 5.08974358974359e-06, "loss": 0.3407, "step": 1149 }, { "epoch": 9.82905982905983, "grad_norm": 5.273305892944336, "learning_rate": 5.085470085470086e-06, "loss": 0.1702, "step": 1150 }, { "epoch": 9.837606837606838, "grad_norm": 2.995126485824585, "learning_rate": 5.081196581196581e-06, "loss": 0.228, "step": 1151 }, { "epoch": 9.846153846153847, "grad_norm": 4.077675819396973, "learning_rate": 5.076923076923077e-06, "loss": 0.4022, "step": 1152 }, { "epoch": 9.854700854700855, "grad_norm": 2.1732425689697266, "learning_rate": 5.072649572649573e-06, "loss": 0.1178, "step": 1153 }, { "epoch": 9.863247863247864, "grad_norm": 2.905172109603882, "learning_rate": 5.068376068376069e-06, "loss": 0.1718, "step": 1154 }, { "epoch": 9.871794871794872, "grad_norm": 2.702521324157715, "learning_rate": 5.064102564102565e-06, "loss": 0.1488, "step": 1155 }, { "epoch": 9.88034188034188, "grad_norm": 2.414088487625122, "learning_rate": 5.05982905982906e-06, "loss": 0.1034, "step": 1156 }, { "epoch": 9.88888888888889, "grad_norm": 2.618173360824585, "learning_rate": 5.0555555555555555e-06, "loss": 0.0783, "step": 1157 }, { "epoch": 9.897435897435898, "grad_norm": 5.002628803253174, "learning_rate": 5.051282051282051e-06, "loss": 0.1195, "step": 1158 }, { "epoch": 9.905982905982906, "grad_norm": 2.84708833694458, "learning_rate": 5.047008547008547e-06, "loss": 0.0906, "step": 1159 }, { "epoch": 9.914529914529915, "grad_norm": 5.564020156860352, "learning_rate": 5.042735042735043e-06, "loss": 0.2037, "step": 1160 }, { "epoch": 9.923076923076923, "grad_norm": 3.7763166427612305, "learning_rate": 5.038461538461539e-06, "loss": 0.2067, "step": 1161 }, { "epoch": 9.931623931623932, "grad_norm": 2.67268705368042, "learning_rate": 5.034188034188034e-06, "loss": 0.0557, "step": 1162 }, { "epoch": 9.94017094017094, "grad_norm": 2.4144680500030518, "learning_rate": 5.02991452991453e-06, "loss": 0.194, "step": 1163 }, { "epoch": 9.948717948717949, "grad_norm": 2.0716731548309326, "learning_rate": 5.025641025641026e-06, "loss": 0.1253, "step": 1164 }, { "epoch": 9.957264957264957, "grad_norm": 13.20478630065918, "learning_rate": 5.021367521367522e-06, "loss": 0.268, "step": 1165 }, { "epoch": 9.965811965811966, "grad_norm": 2.093698263168335, "learning_rate": 5.017094017094018e-06, "loss": 0.0738, "step": 1166 }, { "epoch": 9.974358974358974, "grad_norm": 2.2758119106292725, "learning_rate": 5.012820512820514e-06, "loss": 0.0804, "step": 1167 }, { "epoch": 9.982905982905983, "grad_norm": 21.843395233154297, "learning_rate": 5.008547008547009e-06, "loss": 0.3298, "step": 1168 }, { "epoch": 9.991452991452991, "grad_norm": 3.0435073375701904, "learning_rate": 5.0042735042735045e-06, "loss": 0.1318, "step": 1169 }, { "epoch": 10.0, "grad_norm": 8.449163436889648, "learning_rate": 5e-06, "loss": 0.1725, "step": 1170 }, { "epoch": 10.0, "eval_loss": 0.10285739600658417, "eval_runtime": 9.2384, "eval_samples_per_second": 50.441, "eval_steps_per_second": 6.386, "step": 1170 }, { "epoch": 10.008547008547009, "grad_norm": 4.151456356048584, "learning_rate": 4.995726495726496e-06, "loss": 0.3336, "step": 1171 }, { "epoch": 10.017094017094017, "grad_norm": 2.38647723197937, "learning_rate": 4.991452991452992e-06, "loss": 0.1138, "step": 1172 }, { "epoch": 10.025641025641026, "grad_norm": 4.44817590713501, "learning_rate": 4.987179487179487e-06, "loss": 0.0954, "step": 1173 }, { "epoch": 10.034188034188034, "grad_norm": 2.6213347911834717, "learning_rate": 4.982905982905983e-06, "loss": 0.0695, "step": 1174 }, { "epoch": 10.042735042735043, "grad_norm": 4.664891719818115, "learning_rate": 4.978632478632479e-06, "loss": 0.1067, "step": 1175 }, { "epoch": 10.051282051282051, "grad_norm": 1.7059048414230347, "learning_rate": 4.974358974358975e-06, "loss": 0.0321, "step": 1176 }, { "epoch": 10.05982905982906, "grad_norm": 5.123709678649902, "learning_rate": 4.970085470085471e-06, "loss": 0.2117, "step": 1177 }, { "epoch": 10.068376068376068, "grad_norm": 2.2717695236206055, "learning_rate": 4.965811965811967e-06, "loss": 0.2187, "step": 1178 }, { "epoch": 10.076923076923077, "grad_norm": 4.669886112213135, "learning_rate": 4.961538461538462e-06, "loss": 0.4615, "step": 1179 }, { "epoch": 10.085470085470085, "grad_norm": 18.739727020263672, "learning_rate": 4.957264957264958e-06, "loss": 0.3431, "step": 1180 }, { "epoch": 10.094017094017094, "grad_norm": 7.798559188842773, "learning_rate": 4.9529914529914535e-06, "loss": 0.2483, "step": 1181 }, { "epoch": 10.102564102564102, "grad_norm": 22.59453773498535, "learning_rate": 4.948717948717949e-06, "loss": 0.15, "step": 1182 }, { "epoch": 10.11111111111111, "grad_norm": 2.5734364986419678, "learning_rate": 4.944444444444445e-06, "loss": 0.0465, "step": 1183 }, { "epoch": 10.11965811965812, "grad_norm": 3.1944875717163086, "learning_rate": 4.940170940170941e-06, "loss": 0.1429, "step": 1184 }, { "epoch": 10.128205128205128, "grad_norm": 1.6943906545639038, "learning_rate": 4.935897435897436e-06, "loss": 0.0685, "step": 1185 }, { "epoch": 10.136752136752136, "grad_norm": 4.497282981872559, "learning_rate": 4.931623931623932e-06, "loss": 0.2113, "step": 1186 }, { "epoch": 10.145299145299145, "grad_norm": 2.9377167224884033, "learning_rate": 4.927350427350428e-06, "loss": 0.1352, "step": 1187 }, { "epoch": 10.153846153846153, "grad_norm": 8.528215408325195, "learning_rate": 4.923076923076924e-06, "loss": 0.3268, "step": 1188 }, { "epoch": 10.162393162393162, "grad_norm": 2.143850803375244, "learning_rate": 4.918803418803419e-06, "loss": 0.0923, "step": 1189 }, { "epoch": 10.17094017094017, "grad_norm": 3.921250343322754, "learning_rate": 4.914529914529915e-06, "loss": 0.1451, "step": 1190 }, { "epoch": 10.179487179487179, "grad_norm": 10.713285446166992, "learning_rate": 4.910256410256411e-06, "loss": 0.17, "step": 1191 }, { "epoch": 10.188034188034187, "grad_norm": 2.450204849243164, "learning_rate": 4.905982905982906e-06, "loss": 0.0765, "step": 1192 }, { "epoch": 10.196581196581196, "grad_norm": 4.750647068023682, "learning_rate": 4.901709401709402e-06, "loss": 0.2829, "step": 1193 }, { "epoch": 10.205128205128204, "grad_norm": 12.714463233947754, "learning_rate": 4.8974358974358975e-06, "loss": 0.6767, "step": 1194 }, { "epoch": 10.213675213675213, "grad_norm": 6.759951591491699, "learning_rate": 4.8931623931623934e-06, "loss": 0.2369, "step": 1195 }, { "epoch": 10.222222222222221, "grad_norm": 8.592784881591797, "learning_rate": 4.888888888888889e-06, "loss": 0.4203, "step": 1196 }, { "epoch": 10.23076923076923, "grad_norm": 5.04047155380249, "learning_rate": 4.884615384615385e-06, "loss": 0.1023, "step": 1197 }, { "epoch": 10.239316239316238, "grad_norm": 38.112152099609375, "learning_rate": 4.88034188034188e-06, "loss": 0.4686, "step": 1198 }, { "epoch": 10.247863247863247, "grad_norm": 6.751104354858398, "learning_rate": 4.876068376068376e-06, "loss": 0.085, "step": 1199 }, { "epoch": 10.256410256410255, "grad_norm": 4.3117594718933105, "learning_rate": 4.871794871794872e-06, "loss": 0.1504, "step": 1200 }, { "epoch": 10.264957264957266, "grad_norm": 2.251265287399292, "learning_rate": 4.867521367521368e-06, "loss": 0.1664, "step": 1201 }, { "epoch": 10.273504273504274, "grad_norm": 2.1650373935699463, "learning_rate": 4.863247863247864e-06, "loss": 0.0959, "step": 1202 }, { "epoch": 10.282051282051283, "grad_norm": 2.5863089561462402, "learning_rate": 4.85897435897436e-06, "loss": 0.1148, "step": 1203 }, { "epoch": 10.290598290598291, "grad_norm": 1.974357008934021, "learning_rate": 4.854700854700855e-06, "loss": 0.0663, "step": 1204 }, { "epoch": 10.2991452991453, "grad_norm": 2.3226940631866455, "learning_rate": 4.850427350427351e-06, "loss": 0.1363, "step": 1205 }, { "epoch": 10.307692307692308, "grad_norm": 4.034085750579834, "learning_rate": 4.8461538461538465e-06, "loss": 0.3473, "step": 1206 }, { "epoch": 10.316239316239317, "grad_norm": 2.492307186126709, "learning_rate": 4.8418803418803424e-06, "loss": 0.1742, "step": 1207 }, { "epoch": 10.324786324786325, "grad_norm": 2.886432409286499, "learning_rate": 4.837606837606838e-06, "loss": 0.1382, "step": 1208 }, { "epoch": 10.333333333333334, "grad_norm": 3.6314749717712402, "learning_rate": 4.833333333333333e-06, "loss": 0.1556, "step": 1209 }, { "epoch": 10.341880341880342, "grad_norm": 2.2757928371429443, "learning_rate": 4.829059829059829e-06, "loss": 0.0434, "step": 1210 }, { "epoch": 10.350427350427351, "grad_norm": 3.4152615070343018, "learning_rate": 4.824786324786325e-06, "loss": 0.2903, "step": 1211 }, { "epoch": 10.35897435897436, "grad_norm": 3.873960256576538, "learning_rate": 4.820512820512821e-06, "loss": 0.2611, "step": 1212 }, { "epoch": 10.367521367521368, "grad_norm": 4.2241291999816895, "learning_rate": 4.816239316239317e-06, "loss": 0.0954, "step": 1213 }, { "epoch": 10.376068376068377, "grad_norm": 5.454725742340088, "learning_rate": 4.811965811965813e-06, "loss": 0.1361, "step": 1214 }, { "epoch": 10.384615384615385, "grad_norm": 3.482558012008667, "learning_rate": 4.807692307692308e-06, "loss": 0.0861, "step": 1215 }, { "epoch": 10.393162393162394, "grad_norm": 2.301254987716675, "learning_rate": 4.803418803418804e-06, "loss": 0.1571, "step": 1216 }, { "epoch": 10.401709401709402, "grad_norm": 6.0665602684021, "learning_rate": 4.7991452991453e-06, "loss": 0.5323, "step": 1217 }, { "epoch": 10.41025641025641, "grad_norm": 3.6052770614624023, "learning_rate": 4.7948717948717955e-06, "loss": 0.3789, "step": 1218 }, { "epoch": 10.418803418803419, "grad_norm": 3.9434757232666016, "learning_rate": 4.790598290598291e-06, "loss": 0.0605, "step": 1219 }, { "epoch": 10.427350427350428, "grad_norm": 5.260069847106934, "learning_rate": 4.786324786324787e-06, "loss": 0.3163, "step": 1220 }, { "epoch": 10.435897435897436, "grad_norm": 5.219394207000732, "learning_rate": 4.782051282051282e-06, "loss": 0.4339, "step": 1221 }, { "epoch": 10.444444444444445, "grad_norm": 2.7057230472564697, "learning_rate": 4.777777777777778e-06, "loss": 0.0787, "step": 1222 }, { "epoch": 10.452991452991453, "grad_norm": 11.005247116088867, "learning_rate": 4.773504273504274e-06, "loss": 0.255, "step": 1223 }, { "epoch": 10.461538461538462, "grad_norm": 1.7238801717758179, "learning_rate": 4.76923076923077e-06, "loss": 0.0605, "step": 1224 }, { "epoch": 10.47008547008547, "grad_norm": 6.509312629699707, "learning_rate": 4.764957264957265e-06, "loss": 0.2899, "step": 1225 }, { "epoch": 10.478632478632479, "grad_norm": 7.1476359367370605, "learning_rate": 4.760683760683761e-06, "loss": 0.336, "step": 1226 }, { "epoch": 10.487179487179487, "grad_norm": 15.92902660369873, "learning_rate": 4.756410256410257e-06, "loss": 0.4864, "step": 1227 }, { "epoch": 10.495726495726496, "grad_norm": 5.545684337615967, "learning_rate": 4.752136752136752e-06, "loss": 0.4741, "step": 1228 }, { "epoch": 10.504273504273504, "grad_norm": 3.2521066665649414, "learning_rate": 4.747863247863248e-06, "loss": 0.0894, "step": 1229 }, { "epoch": 10.512820512820513, "grad_norm": 2.696866512298584, "learning_rate": 4.743589743589744e-06, "loss": 0.111, "step": 1230 }, { "epoch": 10.521367521367521, "grad_norm": 1.8362340927124023, "learning_rate": 4.7393162393162396e-06, "loss": 0.0579, "step": 1231 }, { "epoch": 10.52991452991453, "grad_norm": 2.96872878074646, "learning_rate": 4.7350427350427355e-06, "loss": 0.0781, "step": 1232 }, { "epoch": 10.538461538461538, "grad_norm": 1.5503445863723755, "learning_rate": 4.730769230769231e-06, "loss": 0.0451, "step": 1233 }, { "epoch": 10.547008547008547, "grad_norm": 3.9600377082824707, "learning_rate": 4.726495726495726e-06, "loss": 0.1721, "step": 1234 }, { "epoch": 10.555555555555555, "grad_norm": 3.3868823051452637, "learning_rate": 4.722222222222222e-06, "loss": 0.1803, "step": 1235 }, { "epoch": 10.564102564102564, "grad_norm": 2.528111219406128, "learning_rate": 4.717948717948718e-06, "loss": 0.238, "step": 1236 }, { "epoch": 10.572649572649572, "grad_norm": 6.960350036621094, "learning_rate": 4.713675213675214e-06, "loss": 0.4353, "step": 1237 }, { "epoch": 10.581196581196581, "grad_norm": 2.3169686794281006, "learning_rate": 4.70940170940171e-06, "loss": 0.1891, "step": 1238 }, { "epoch": 10.58974358974359, "grad_norm": 2.021212577819824, "learning_rate": 4.705128205128206e-06, "loss": 0.0865, "step": 1239 }, { "epoch": 10.598290598290598, "grad_norm": 2.445462942123413, "learning_rate": 4.700854700854701e-06, "loss": 0.0973, "step": 1240 }, { "epoch": 10.606837606837606, "grad_norm": 3.4490067958831787, "learning_rate": 4.696581196581197e-06, "loss": 0.1419, "step": 1241 }, { "epoch": 10.615384615384615, "grad_norm": 3.2859914302825928, "learning_rate": 4.692307692307693e-06, "loss": 0.1587, "step": 1242 }, { "epoch": 10.623931623931623, "grad_norm": 4.754831790924072, "learning_rate": 4.6880341880341886e-06, "loss": 0.2537, "step": 1243 }, { "epoch": 10.632478632478632, "grad_norm": 3.220867156982422, "learning_rate": 4.6837606837606844e-06, "loss": 0.0941, "step": 1244 }, { "epoch": 10.64102564102564, "grad_norm": 5.699328422546387, "learning_rate": 4.6794871794871795e-06, "loss": 0.255, "step": 1245 }, { "epoch": 10.649572649572649, "grad_norm": 1.5174522399902344, "learning_rate": 4.675213675213675e-06, "loss": 0.048, "step": 1246 }, { "epoch": 10.658119658119658, "grad_norm": 2.4277050495147705, "learning_rate": 4.670940170940171e-06, "loss": 0.1127, "step": 1247 }, { "epoch": 10.666666666666666, "grad_norm": 2.079031229019165, "learning_rate": 4.666666666666667e-06, "loss": 0.1038, "step": 1248 }, { "epoch": 10.675213675213675, "grad_norm": 953.4605102539062, "learning_rate": 4.662393162393163e-06, "loss": 1.1892, "step": 1249 }, { "epoch": 10.683760683760683, "grad_norm": 9.190105438232422, "learning_rate": 4.658119658119659e-06, "loss": 0.3541, "step": 1250 }, { "epoch": 10.692307692307692, "grad_norm": 2.3222947120666504, "learning_rate": 4.653846153846154e-06, "loss": 0.0842, "step": 1251 }, { "epoch": 10.7008547008547, "grad_norm": 2.2312700748443604, "learning_rate": 4.64957264957265e-06, "loss": 0.088, "step": 1252 }, { "epoch": 10.709401709401709, "grad_norm": 3.987630844116211, "learning_rate": 4.645299145299146e-06, "loss": 0.1667, "step": 1253 }, { "epoch": 10.717948717948717, "grad_norm": 5.108981609344482, "learning_rate": 4.641025641025642e-06, "loss": 0.4291, "step": 1254 }, { "epoch": 10.726495726495726, "grad_norm": 2.8597464561462402, "learning_rate": 4.6367521367521375e-06, "loss": 0.0564, "step": 1255 }, { "epoch": 10.735042735042736, "grad_norm": 2.3642940521240234, "learning_rate": 4.6324786324786334e-06, "loss": 0.0909, "step": 1256 }, { "epoch": 10.743589743589745, "grad_norm": 1.5703462362289429, "learning_rate": 4.6282051282051285e-06, "loss": 0.0395, "step": 1257 }, { "epoch": 10.752136752136753, "grad_norm": 2.952786922454834, "learning_rate": 4.623931623931624e-06, "loss": 0.1824, "step": 1258 }, { "epoch": 10.760683760683762, "grad_norm": 2.9027185440063477, "learning_rate": 4.61965811965812e-06, "loss": 0.0765, "step": 1259 }, { "epoch": 10.76923076923077, "grad_norm": 2.4386038780212402, "learning_rate": 4.615384615384616e-06, "loss": 0.2761, "step": 1260 }, { "epoch": 10.777777777777779, "grad_norm": 7.146468639373779, "learning_rate": 4.611111111111112e-06, "loss": 0.4427, "step": 1261 }, { "epoch": 10.786324786324787, "grad_norm": 2.002096652984619, "learning_rate": 4.606837606837607e-06, "loss": 0.0879, "step": 1262 }, { "epoch": 10.794871794871796, "grad_norm": 6.504697322845459, "learning_rate": 4.602564102564103e-06, "loss": 0.1805, "step": 1263 }, { "epoch": 10.803418803418804, "grad_norm": 9.748340606689453, "learning_rate": 4.598290598290598e-06, "loss": 0.5813, "step": 1264 }, { "epoch": 10.811965811965813, "grad_norm": 3.67153000831604, "learning_rate": 4.594017094017094e-06, "loss": 0.4175, "step": 1265 }, { "epoch": 10.820512820512821, "grad_norm": 9.109044075012207, "learning_rate": 4.58974358974359e-06, "loss": 0.4505, "step": 1266 }, { "epoch": 10.82905982905983, "grad_norm": 5.419683933258057, "learning_rate": 4.585470085470086e-06, "loss": 0.2316, "step": 1267 }, { "epoch": 10.837606837606838, "grad_norm": 2.901182174682617, "learning_rate": 4.581196581196582e-06, "loss": 0.0583, "step": 1268 }, { "epoch": 10.846153846153847, "grad_norm": 4.579897403717041, "learning_rate": 4.5769230769230775e-06, "loss": 0.0536, "step": 1269 }, { "epoch": 10.854700854700855, "grad_norm": 4.232446670532227, "learning_rate": 4.5726495726495725e-06, "loss": 0.17, "step": 1270 }, { "epoch": 10.863247863247864, "grad_norm": 8.059329986572266, "learning_rate": 4.568376068376068e-06, "loss": 0.256, "step": 1271 }, { "epoch": 10.871794871794872, "grad_norm": 1.5736984014511108, "learning_rate": 4.564102564102564e-06, "loss": 0.058, "step": 1272 }, { "epoch": 10.88034188034188, "grad_norm": 5.397885799407959, "learning_rate": 4.55982905982906e-06, "loss": 0.1299, "step": 1273 }, { "epoch": 10.88888888888889, "grad_norm": 3.9831533432006836, "learning_rate": 4.555555555555556e-06, "loss": 0.1762, "step": 1274 }, { "epoch": 10.897435897435898, "grad_norm": 2.170370101928711, "learning_rate": 4.551282051282052e-06, "loss": 0.1355, "step": 1275 }, { "epoch": 10.905982905982906, "grad_norm": 5.151463508605957, "learning_rate": 4.547008547008547e-06, "loss": 0.3151, "step": 1276 }, { "epoch": 10.914529914529915, "grad_norm": 2.215559482574463, "learning_rate": 4.542735042735043e-06, "loss": 0.1054, "step": 1277 }, { "epoch": 10.923076923076923, "grad_norm": 3.62188458442688, "learning_rate": 4.538461538461539e-06, "loss": 0.3839, "step": 1278 }, { "epoch": 10.931623931623932, "grad_norm": 1.8855514526367188, "learning_rate": 4.534188034188035e-06, "loss": 0.0639, "step": 1279 }, { "epoch": 10.94017094017094, "grad_norm": 3.0260651111602783, "learning_rate": 4.5299145299145306e-06, "loss": 0.1216, "step": 1280 }, { "epoch": 10.948717948717949, "grad_norm": 13.30820083618164, "learning_rate": 4.525641025641026e-06, "loss": 0.3337, "step": 1281 }, { "epoch": 10.957264957264957, "grad_norm": 4.356720447540283, "learning_rate": 4.5213675213675215e-06, "loss": 0.2692, "step": 1282 }, { "epoch": 10.965811965811966, "grad_norm": 2.077742576599121, "learning_rate": 4.517094017094017e-06, "loss": 0.1181, "step": 1283 }, { "epoch": 10.974358974358974, "grad_norm": 6.6224284172058105, "learning_rate": 4.512820512820513e-06, "loss": 0.1526, "step": 1284 }, { "epoch": 10.982905982905983, "grad_norm": 4.072678565979004, "learning_rate": 4.508547008547009e-06, "loss": 0.1804, "step": 1285 }, { "epoch": 10.991452991452991, "grad_norm": 3.430922269821167, "learning_rate": 4.504273504273505e-06, "loss": 0.1316, "step": 1286 }, { "epoch": 11.0, "grad_norm": 1.6371959447860718, "learning_rate": 4.5e-06, "loss": 0.0596, "step": 1287 }, { "epoch": 11.0, "eval_loss": 0.08654214441776276, "eval_runtime": 9.3013, "eval_samples_per_second": 50.1, "eval_steps_per_second": 6.343, "step": 1287 }, { "epoch": 11.008547008547009, "grad_norm": 5.072701454162598, "learning_rate": 4.495726495726496e-06, "loss": 0.2195, "step": 1288 }, { "epoch": 11.017094017094017, "grad_norm": 6.791895389556885, "learning_rate": 4.491452991452992e-06, "loss": 0.5354, "step": 1289 }, { "epoch": 11.025641025641026, "grad_norm": 12.475218772888184, "learning_rate": 4.487179487179488e-06, "loss": 0.1828, "step": 1290 }, { "epoch": 11.034188034188034, "grad_norm": 5.892624855041504, "learning_rate": 4.482905982905984e-06, "loss": 0.1617, "step": 1291 }, { "epoch": 11.042735042735043, "grad_norm": 1.742074728012085, "learning_rate": 4.4786324786324796e-06, "loss": 0.0508, "step": 1292 }, { "epoch": 11.051282051282051, "grad_norm": 2.389373302459717, "learning_rate": 4.474358974358975e-06, "loss": 0.1009, "step": 1293 }, { "epoch": 11.05982905982906, "grad_norm": 3.7152106761932373, "learning_rate": 4.4700854700854705e-06, "loss": 0.2157, "step": 1294 }, { "epoch": 11.068376068376068, "grad_norm": 7.217955112457275, "learning_rate": 4.465811965811966e-06, "loss": 0.2737, "step": 1295 }, { "epoch": 11.076923076923077, "grad_norm": 2.0971977710723877, "learning_rate": 4.461538461538462e-06, "loss": 0.1273, "step": 1296 }, { "epoch": 11.085470085470085, "grad_norm": 1.1616859436035156, "learning_rate": 4.457264957264958e-06, "loss": 0.0325, "step": 1297 }, { "epoch": 11.094017094017094, "grad_norm": 3.4287424087524414, "learning_rate": 4.452991452991453e-06, "loss": 0.1136, "step": 1298 }, { "epoch": 11.102564102564102, "grad_norm": 1.6207005977630615, "learning_rate": 4.448717948717949e-06, "loss": 0.0344, "step": 1299 }, { "epoch": 11.11111111111111, "grad_norm": 3.009976863861084, "learning_rate": 4.444444444444444e-06, "loss": 0.1532, "step": 1300 }, { "epoch": 11.11965811965812, "grad_norm": 2.9768505096435547, "learning_rate": 4.44017094017094e-06, "loss": 0.0874, "step": 1301 }, { "epoch": 11.128205128205128, "grad_norm": 3.622715473175049, "learning_rate": 4.435897435897436e-06, "loss": 0.3132, "step": 1302 }, { "epoch": 11.136752136752136, "grad_norm": 3.5741326808929443, "learning_rate": 4.431623931623932e-06, "loss": 0.0914, "step": 1303 }, { "epoch": 11.145299145299145, "grad_norm": 7.436197280883789, "learning_rate": 4.427350427350428e-06, "loss": 0.329, "step": 1304 }, { "epoch": 11.153846153846153, "grad_norm": 2.390066146850586, "learning_rate": 4.423076923076924e-06, "loss": 0.0867, "step": 1305 }, { "epoch": 11.162393162393162, "grad_norm": 1.928227424621582, "learning_rate": 4.418803418803419e-06, "loss": 0.0294, "step": 1306 }, { "epoch": 11.17094017094017, "grad_norm": 4.40464448928833, "learning_rate": 4.4145299145299145e-06, "loss": 0.3704, "step": 1307 }, { "epoch": 11.179487179487179, "grad_norm": 22.183835983276367, "learning_rate": 4.4102564102564104e-06, "loss": 0.6011, "step": 1308 }, { "epoch": 11.188034188034187, "grad_norm": 2.496633768081665, "learning_rate": 4.405982905982906e-06, "loss": 0.0494, "step": 1309 }, { "epoch": 11.196581196581196, "grad_norm": 1.142687201499939, "learning_rate": 4.401709401709402e-06, "loss": 0.0292, "step": 1310 }, { "epoch": 11.205128205128204, "grad_norm": 2.0762455463409424, "learning_rate": 4.397435897435898e-06, "loss": 0.1123, "step": 1311 }, { "epoch": 11.213675213675213, "grad_norm": 1.5389565229415894, "learning_rate": 4.393162393162393e-06, "loss": 0.0316, "step": 1312 }, { "epoch": 11.222222222222221, "grad_norm": 4.252040386199951, "learning_rate": 4.388888888888889e-06, "loss": 0.0832, "step": 1313 }, { "epoch": 11.23076923076923, "grad_norm": 2.1999545097351074, "learning_rate": 4.384615384615385e-06, "loss": 0.1121, "step": 1314 }, { "epoch": 11.239316239316238, "grad_norm": 3.3256099224090576, "learning_rate": 4.380341880341881e-06, "loss": 0.1288, "step": 1315 }, { "epoch": 11.247863247863247, "grad_norm": 2.6664986610412598, "learning_rate": 4.376068376068377e-06, "loss": 0.1044, "step": 1316 }, { "epoch": 11.256410256410255, "grad_norm": 4.103114604949951, "learning_rate": 4.371794871794872e-06, "loss": 0.3115, "step": 1317 }, { "epoch": 11.264957264957266, "grad_norm": 2.717532157897949, "learning_rate": 4.367521367521368e-06, "loss": 0.1144, "step": 1318 }, { "epoch": 11.273504273504274, "grad_norm": 2.7918317317962646, "learning_rate": 4.3632478632478635e-06, "loss": 0.1205, "step": 1319 }, { "epoch": 11.282051282051283, "grad_norm": 2.439854383468628, "learning_rate": 4.358974358974359e-06, "loss": 0.05, "step": 1320 }, { "epoch": 11.290598290598291, "grad_norm": 1.3528865575790405, "learning_rate": 4.354700854700855e-06, "loss": 0.0437, "step": 1321 }, { "epoch": 11.2991452991453, "grad_norm": 3.3273401260375977, "learning_rate": 4.350427350427351e-06, "loss": 0.1417, "step": 1322 }, { "epoch": 11.307692307692308, "grad_norm": 4.022815704345703, "learning_rate": 4.346153846153846e-06, "loss": 0.0845, "step": 1323 }, { "epoch": 11.316239316239317, "grad_norm": 5.169338703155518, "learning_rate": 4.341880341880342e-06, "loss": 0.5235, "step": 1324 }, { "epoch": 11.324786324786325, "grad_norm": 1.8199687004089355, "learning_rate": 4.337606837606838e-06, "loss": 0.0399, "step": 1325 }, { "epoch": 11.333333333333334, "grad_norm": 3.3616087436676025, "learning_rate": 4.333333333333334e-06, "loss": 0.1428, "step": 1326 }, { "epoch": 11.341880341880342, "grad_norm": 14.056232452392578, "learning_rate": 4.32905982905983e-06, "loss": 0.2921, "step": 1327 }, { "epoch": 11.350427350427351, "grad_norm": 2.3905317783355713, "learning_rate": 4.324786324786326e-06, "loss": 0.0478, "step": 1328 }, { "epoch": 11.35897435897436, "grad_norm": 9.876815795898438, "learning_rate": 4.320512820512821e-06, "loss": 0.1926, "step": 1329 }, { "epoch": 11.367521367521368, "grad_norm": 1.3726049661636353, "learning_rate": 4.316239316239317e-06, "loss": 0.0416, "step": 1330 }, { "epoch": 11.376068376068377, "grad_norm": 3.0890841484069824, "learning_rate": 4.3119658119658125e-06, "loss": 0.0614, "step": 1331 }, { "epoch": 11.384615384615385, "grad_norm": 2.858560562133789, "learning_rate": 4.307692307692308e-06, "loss": 0.2068, "step": 1332 }, { "epoch": 11.393162393162394, "grad_norm": 4.6819658279418945, "learning_rate": 4.303418803418804e-06, "loss": 0.5773, "step": 1333 }, { "epoch": 11.401709401709402, "grad_norm": 1.741450548171997, "learning_rate": 4.299145299145299e-06, "loss": 0.0505, "step": 1334 }, { "epoch": 11.41025641025641, "grad_norm": 3.5882327556610107, "learning_rate": 4.294871794871795e-06, "loss": 0.1797, "step": 1335 }, { "epoch": 11.418803418803419, "grad_norm": 3.59714937210083, "learning_rate": 4.29059829059829e-06, "loss": 0.1531, "step": 1336 }, { "epoch": 11.427350427350428, "grad_norm": 3.619572877883911, "learning_rate": 4.286324786324786e-06, "loss": 0.1028, "step": 1337 }, { "epoch": 11.435897435897436, "grad_norm": 3.9230782985687256, "learning_rate": 4.282051282051282e-06, "loss": 0.2404, "step": 1338 }, { "epoch": 11.444444444444445, "grad_norm": 3.6987717151641846, "learning_rate": 4.277777777777778e-06, "loss": 0.1795, "step": 1339 }, { "epoch": 11.452991452991453, "grad_norm": 3.322707176208496, "learning_rate": 4.273504273504274e-06, "loss": 0.0968, "step": 1340 }, { "epoch": 11.461538461538462, "grad_norm": 1.2378501892089844, "learning_rate": 4.26923076923077e-06, "loss": 0.0387, "step": 1341 }, { "epoch": 11.47008547008547, "grad_norm": 2.6801578998565674, "learning_rate": 4.264957264957265e-06, "loss": 0.0475, "step": 1342 }, { "epoch": 11.478632478632479, "grad_norm": 2.2003352642059326, "learning_rate": 4.260683760683761e-06, "loss": 0.0505, "step": 1343 }, { "epoch": 11.487179487179487, "grad_norm": 1.701341152191162, "learning_rate": 4.2564102564102566e-06, "loss": 0.064, "step": 1344 }, { "epoch": 11.495726495726496, "grad_norm": 9.939803123474121, "learning_rate": 4.2521367521367524e-06, "loss": 0.461, "step": 1345 }, { "epoch": 11.504273504273504, "grad_norm": 3.2999305725097656, "learning_rate": 4.247863247863248e-06, "loss": 0.1653, "step": 1346 }, { "epoch": 11.512820512820513, "grad_norm": 3.9968252182006836, "learning_rate": 4.243589743589744e-06, "loss": 0.123, "step": 1347 }, { "epoch": 11.521367521367521, "grad_norm": 2.846968173980713, "learning_rate": 4.239316239316239e-06, "loss": 0.1161, "step": 1348 }, { "epoch": 11.52991452991453, "grad_norm": 4.328092575073242, "learning_rate": 4.235042735042735e-06, "loss": 0.065, "step": 1349 }, { "epoch": 11.538461538461538, "grad_norm": 3.649003267288208, "learning_rate": 4.230769230769231e-06, "loss": 0.1919, "step": 1350 }, { "epoch": 11.547008547008547, "grad_norm": 4.094634056091309, "learning_rate": 4.226495726495727e-06, "loss": 0.1728, "step": 1351 }, { "epoch": 11.555555555555555, "grad_norm": 2.3904240131378174, "learning_rate": 4.222222222222223e-06, "loss": 0.105, "step": 1352 }, { "epoch": 11.564102564102564, "grad_norm": 1.8493746519088745, "learning_rate": 4.217948717948718e-06, "loss": 0.0373, "step": 1353 }, { "epoch": 11.572649572649572, "grad_norm": 4.690928936004639, "learning_rate": 4.213675213675214e-06, "loss": 0.3405, "step": 1354 }, { "epoch": 11.581196581196581, "grad_norm": 6.808948516845703, "learning_rate": 4.20940170940171e-06, "loss": 0.1308, "step": 1355 }, { "epoch": 11.58974358974359, "grad_norm": 6.060946464538574, "learning_rate": 4.2051282051282055e-06, "loss": 0.1494, "step": 1356 }, { "epoch": 11.598290598290598, "grad_norm": 1.5923279523849487, "learning_rate": 4.2008547008547014e-06, "loss": 0.044, "step": 1357 }, { "epoch": 11.606837606837606, "grad_norm": 1.7796354293823242, "learning_rate": 4.196581196581197e-06, "loss": 0.0558, "step": 1358 }, { "epoch": 11.615384615384615, "grad_norm": 1.2209490537643433, "learning_rate": 4.192307692307692e-06, "loss": 0.0492, "step": 1359 }, { "epoch": 11.623931623931623, "grad_norm": 4.0859880447387695, "learning_rate": 4.188034188034188e-06, "loss": 0.0759, "step": 1360 }, { "epoch": 11.632478632478632, "grad_norm": 3.5021755695343018, "learning_rate": 4.183760683760684e-06, "loss": 0.1263, "step": 1361 }, { "epoch": 11.64102564102564, "grad_norm": 2.5915517807006836, "learning_rate": 4.17948717948718e-06, "loss": 0.1949, "step": 1362 }, { "epoch": 11.649572649572649, "grad_norm": 2.8024656772613525, "learning_rate": 4.175213675213676e-06, "loss": 0.2325, "step": 1363 }, { "epoch": 11.658119658119658, "grad_norm": 5.795172691345215, "learning_rate": 4.170940170940172e-06, "loss": 0.3253, "step": 1364 }, { "epoch": 11.666666666666666, "grad_norm": 5.056031227111816, "learning_rate": 4.166666666666667e-06, "loss": 0.102, "step": 1365 }, { "epoch": 11.675213675213675, "grad_norm": 6.092950820922852, "learning_rate": 4.162393162393163e-06, "loss": 0.1938, "step": 1366 }, { "epoch": 11.683760683760683, "grad_norm": 4.44755744934082, "learning_rate": 4.158119658119659e-06, "loss": 0.1588, "step": 1367 }, { "epoch": 11.692307692307692, "grad_norm": 171.19509887695312, "learning_rate": 4.1538461538461545e-06, "loss": 0.3077, "step": 1368 }, { "epoch": 11.7008547008547, "grad_norm": 13.992602348327637, "learning_rate": 4.1495726495726504e-06, "loss": 0.4401, "step": 1369 }, { "epoch": 11.709401709401709, "grad_norm": 2.2174923419952393, "learning_rate": 4.145299145299146e-06, "loss": 0.1751, "step": 1370 }, { "epoch": 11.717948717948717, "grad_norm": 2.031663179397583, "learning_rate": 4.141025641025641e-06, "loss": 0.049, "step": 1371 }, { "epoch": 11.726495726495726, "grad_norm": 4.201449394226074, "learning_rate": 4.136752136752136e-06, "loss": 0.1016, "step": 1372 }, { "epoch": 11.735042735042736, "grad_norm": 3.953226089477539, "learning_rate": 4.132478632478632e-06, "loss": 0.1336, "step": 1373 }, { "epoch": 11.743589743589745, "grad_norm": 1.4856081008911133, "learning_rate": 4.128205128205128e-06, "loss": 0.0537, "step": 1374 }, { "epoch": 11.752136752136753, "grad_norm": 1.2989288568496704, "learning_rate": 4.123931623931624e-06, "loss": 0.0351, "step": 1375 }, { "epoch": 11.760683760683762, "grad_norm": 4.335974216461182, "learning_rate": 4.11965811965812e-06, "loss": 0.0722, "step": 1376 }, { "epoch": 11.76923076923077, "grad_norm": 6.298306941986084, "learning_rate": 4.115384615384616e-06, "loss": 0.2359, "step": 1377 }, { "epoch": 11.777777777777779, "grad_norm": 0.7119566798210144, "learning_rate": 4.111111111111111e-06, "loss": 0.0192, "step": 1378 }, { "epoch": 11.786324786324787, "grad_norm": 2.7993624210357666, "learning_rate": 4.106837606837607e-06, "loss": 0.0605, "step": 1379 }, { "epoch": 11.794871794871796, "grad_norm": 6.566782474517822, "learning_rate": 4.102564102564103e-06, "loss": 0.3883, "step": 1380 }, { "epoch": 11.803418803418804, "grad_norm": 8.177830696105957, "learning_rate": 4.0982905982905986e-06, "loss": 0.257, "step": 1381 }, { "epoch": 11.811965811965813, "grad_norm": 4.04230260848999, "learning_rate": 4.0940170940170945e-06, "loss": 0.0943, "step": 1382 }, { "epoch": 11.820512820512821, "grad_norm": 3.595386505126953, "learning_rate": 4.08974358974359e-06, "loss": 0.0533, "step": 1383 }, { "epoch": 11.82905982905983, "grad_norm": 3.755312204360962, "learning_rate": 4.085470085470085e-06, "loss": 0.0468, "step": 1384 }, { "epoch": 11.837606837606838, "grad_norm": 2.0697362422943115, "learning_rate": 4.081196581196581e-06, "loss": 0.063, "step": 1385 }, { "epoch": 11.846153846153847, "grad_norm": 7.690021991729736, "learning_rate": 4.076923076923077e-06, "loss": 0.2415, "step": 1386 }, { "epoch": 11.854700854700855, "grad_norm": 3.0239031314849854, "learning_rate": 4.072649572649573e-06, "loss": 0.1257, "step": 1387 }, { "epoch": 11.863247863247864, "grad_norm": 2.263847589492798, "learning_rate": 4.068376068376069e-06, "loss": 0.132, "step": 1388 }, { "epoch": 11.871794871794872, "grad_norm": 2.9513261318206787, "learning_rate": 4.064102564102565e-06, "loss": 0.1229, "step": 1389 }, { "epoch": 11.88034188034188, "grad_norm": 3.03973388671875, "learning_rate": 4.05982905982906e-06, "loss": 0.0966, "step": 1390 }, { "epoch": 11.88888888888889, "grad_norm": 1.0075026750564575, "learning_rate": 4.055555555555556e-06, "loss": 0.0284, "step": 1391 }, { "epoch": 11.897435897435898, "grad_norm": 1.5330802202224731, "learning_rate": 4.051282051282052e-06, "loss": 0.0614, "step": 1392 }, { "epoch": 11.905982905982906, "grad_norm": 3.6498589515686035, "learning_rate": 4.0470085470085476e-06, "loss": 0.2236, "step": 1393 }, { "epoch": 11.914529914529915, "grad_norm": 4.659658908843994, "learning_rate": 4.0427350427350435e-06, "loss": 0.3245, "step": 1394 }, { "epoch": 11.923076923076923, "grad_norm": 3.921703815460205, "learning_rate": 4.0384615384615385e-06, "loss": 0.2981, "step": 1395 }, { "epoch": 11.931623931623932, "grad_norm": 5.816749572753906, "learning_rate": 4.034188034188034e-06, "loss": 0.1606, "step": 1396 }, { "epoch": 11.94017094017094, "grad_norm": 1.2831742763519287, "learning_rate": 4.02991452991453e-06, "loss": 0.0307, "step": 1397 }, { "epoch": 11.948717948717949, "grad_norm": 5.745227813720703, "learning_rate": 4.025641025641026e-06, "loss": 0.5323, "step": 1398 }, { "epoch": 11.957264957264957, "grad_norm": 2.4196462631225586, "learning_rate": 4.021367521367522e-06, "loss": 0.09, "step": 1399 }, { "epoch": 11.965811965811966, "grad_norm": 8.084505081176758, "learning_rate": 4.017094017094018e-06, "loss": 0.2991, "step": 1400 }, { "epoch": 11.974358974358974, "grad_norm": 3.786708116531372, "learning_rate": 4.012820512820513e-06, "loss": 0.2163, "step": 1401 }, { "epoch": 11.982905982905983, "grad_norm": 4.76535701751709, "learning_rate": 4.008547008547009e-06, "loss": 0.2453, "step": 1402 }, { "epoch": 11.991452991452991, "grad_norm": 7.380269527435303, "learning_rate": 4.004273504273505e-06, "loss": 0.3525, "step": 1403 }, { "epoch": 12.0, "grad_norm": 41.21335983276367, "learning_rate": 4.000000000000001e-06, "loss": 0.2139, "step": 1404 }, { "epoch": 12.0, "eval_loss": 0.07730000466108322, "eval_runtime": 9.2426, "eval_samples_per_second": 50.419, "eval_steps_per_second": 6.383, "step": 1404 }, { "epoch": 12.008547008547009, "grad_norm": 2.3692574501037598, "learning_rate": 3.9957264957264966e-06, "loss": 0.0939, "step": 1405 }, { "epoch": 12.017094017094017, "grad_norm": 8.087658882141113, "learning_rate": 3.9914529914529924e-06, "loss": 0.2801, "step": 1406 }, { "epoch": 12.025641025641026, "grad_norm": 8.448614120483398, "learning_rate": 3.9871794871794875e-06, "loss": 0.2069, "step": 1407 }, { "epoch": 12.034188034188034, "grad_norm": 1.8581651449203491, "learning_rate": 3.982905982905983e-06, "loss": 0.0509, "step": 1408 }, { "epoch": 12.042735042735043, "grad_norm": 1.711654543876648, "learning_rate": 3.9786324786324784e-06, "loss": 0.0464, "step": 1409 }, { "epoch": 12.051282051282051, "grad_norm": 1.482553482055664, "learning_rate": 3.974358974358974e-06, "loss": 0.028, "step": 1410 }, { "epoch": 12.05982905982906, "grad_norm": 8.005542755126953, "learning_rate": 3.97008547008547e-06, "loss": 0.2587, "step": 1411 }, { "epoch": 12.068376068376068, "grad_norm": 2.1153948307037354, "learning_rate": 3.965811965811966e-06, "loss": 0.0563, "step": 1412 }, { "epoch": 12.076923076923077, "grad_norm": 7.791186809539795, "learning_rate": 3.961538461538462e-06, "loss": 0.0587, "step": 1413 }, { "epoch": 12.085470085470085, "grad_norm": 21.04537582397461, "learning_rate": 3.957264957264957e-06, "loss": 0.252, "step": 1414 }, { "epoch": 12.094017094017094, "grad_norm": 3.144742727279663, "learning_rate": 3.952991452991453e-06, "loss": 0.2207, "step": 1415 }, { "epoch": 12.102564102564102, "grad_norm": 2.23223614692688, "learning_rate": 3.948717948717949e-06, "loss": 0.0923, "step": 1416 }, { "epoch": 12.11111111111111, "grad_norm": 3.5652217864990234, "learning_rate": 3.944444444444445e-06, "loss": 0.2197, "step": 1417 }, { "epoch": 12.11965811965812, "grad_norm": 3.1105499267578125, "learning_rate": 3.940170940170941e-06, "loss": 0.071, "step": 1418 }, { "epoch": 12.128205128205128, "grad_norm": 2.525405168533325, "learning_rate": 3.9358974358974365e-06, "loss": 0.0874, "step": 1419 }, { "epoch": 12.136752136752136, "grad_norm": 4.479174613952637, "learning_rate": 3.9316239316239315e-06, "loss": 0.1872, "step": 1420 }, { "epoch": 12.145299145299145, "grad_norm": 2.0484113693237305, "learning_rate": 3.927350427350427e-06, "loss": 0.0739, "step": 1421 }, { "epoch": 12.153846153846153, "grad_norm": 2.014679431915283, "learning_rate": 3.923076923076923e-06, "loss": 0.1089, "step": 1422 }, { "epoch": 12.162393162393162, "grad_norm": 4.71014404296875, "learning_rate": 3.918803418803419e-06, "loss": 0.3136, "step": 1423 }, { "epoch": 12.17094017094017, "grad_norm": 2.1372437477111816, "learning_rate": 3.914529914529915e-06, "loss": 0.0458, "step": 1424 }, { "epoch": 12.179487179487179, "grad_norm": 1.4595564603805542, "learning_rate": 3.910256410256411e-06, "loss": 0.0601, "step": 1425 }, { "epoch": 12.188034188034187, "grad_norm": 4.45602560043335, "learning_rate": 3.905982905982906e-06, "loss": 0.091, "step": 1426 }, { "epoch": 12.196581196581196, "grad_norm": 1.473585844039917, "learning_rate": 3.901709401709402e-06, "loss": 0.0515, "step": 1427 }, { "epoch": 12.205128205128204, "grad_norm": 1.8761534690856934, "learning_rate": 3.897435897435898e-06, "loss": 0.055, "step": 1428 }, { "epoch": 12.213675213675213, "grad_norm": 0.7121579647064209, "learning_rate": 3.893162393162394e-06, "loss": 0.0197, "step": 1429 }, { "epoch": 12.222222222222221, "grad_norm": 2.0035219192504883, "learning_rate": 3.88888888888889e-06, "loss": 0.0904, "step": 1430 }, { "epoch": 12.23076923076923, "grad_norm": 3.820181369781494, "learning_rate": 3.884615384615385e-06, "loss": 0.2415, "step": 1431 }, { "epoch": 12.239316239316238, "grad_norm": 3.40633225440979, "learning_rate": 3.8803418803418805e-06, "loss": 0.0593, "step": 1432 }, { "epoch": 12.247863247863247, "grad_norm": 7.093897342681885, "learning_rate": 3.876068376068376e-06, "loss": 0.2504, "step": 1433 }, { "epoch": 12.256410256410255, "grad_norm": 2.1057517528533936, "learning_rate": 3.871794871794872e-06, "loss": 0.0573, "step": 1434 }, { "epoch": 12.264957264957266, "grad_norm": 4.797401428222656, "learning_rate": 3.867521367521368e-06, "loss": 0.338, "step": 1435 }, { "epoch": 12.273504273504274, "grad_norm": 20.711339950561523, "learning_rate": 3.863247863247864e-06, "loss": 0.1964, "step": 1436 }, { "epoch": 12.282051282051283, "grad_norm": 2.725280523300171, "learning_rate": 3.858974358974359e-06, "loss": 0.1837, "step": 1437 }, { "epoch": 12.290598290598291, "grad_norm": 0.9469479322433472, "learning_rate": 3.854700854700855e-06, "loss": 0.0231, "step": 1438 }, { "epoch": 12.2991452991453, "grad_norm": 2.0424935817718506, "learning_rate": 3.850427350427351e-06, "loss": 0.1373, "step": 1439 }, { "epoch": 12.307692307692308, "grad_norm": 1.4781558513641357, "learning_rate": 3.846153846153847e-06, "loss": 0.0393, "step": 1440 }, { "epoch": 12.316239316239317, "grad_norm": 3.7576427459716797, "learning_rate": 3.841880341880343e-06, "loss": 0.1134, "step": 1441 }, { "epoch": 12.324786324786325, "grad_norm": 299.5986633300781, "learning_rate": 3.8376068376068386e-06, "loss": 0.8017, "step": 1442 }, { "epoch": 12.333333333333334, "grad_norm": 3.109199047088623, "learning_rate": 3.833333333333334e-06, "loss": 0.1014, "step": 1443 }, { "epoch": 12.341880341880342, "grad_norm": 6.353960990905762, "learning_rate": 3.8290598290598295e-06, "loss": 0.3484, "step": 1444 }, { "epoch": 12.350427350427351, "grad_norm": 12.957517623901367, "learning_rate": 3.8247863247863246e-06, "loss": 0.5644, "step": 1445 }, { "epoch": 12.35897435897436, "grad_norm": 10.197676658630371, "learning_rate": 3.8205128205128204e-06, "loss": 0.1525, "step": 1446 }, { "epoch": 12.367521367521368, "grad_norm": 1.7754546403884888, "learning_rate": 3.816239316239316e-06, "loss": 0.0259, "step": 1447 }, { "epoch": 12.376068376068377, "grad_norm": 1.4237226247787476, "learning_rate": 3.8119658119658122e-06, "loss": 0.0307, "step": 1448 }, { "epoch": 12.384615384615385, "grad_norm": 2.94474458694458, "learning_rate": 3.8076923076923077e-06, "loss": 0.1447, "step": 1449 }, { "epoch": 12.393162393162394, "grad_norm": 3.7823615074157715, "learning_rate": 3.8034188034188036e-06, "loss": 0.071, "step": 1450 }, { "epoch": 12.401709401709402, "grad_norm": 7.5281081199646, "learning_rate": 3.7991452991452995e-06, "loss": 0.1805, "step": 1451 }, { "epoch": 12.41025641025641, "grad_norm": 2.523592233657837, "learning_rate": 3.794871794871795e-06, "loss": 0.0684, "step": 1452 }, { "epoch": 12.418803418803419, "grad_norm": 2.423443078994751, "learning_rate": 3.790598290598291e-06, "loss": 0.0726, "step": 1453 }, { "epoch": 12.427350427350428, "grad_norm": 6.3336005210876465, "learning_rate": 3.7863247863247863e-06, "loss": 0.1684, "step": 1454 }, { "epoch": 12.435897435897436, "grad_norm": 248.31146240234375, "learning_rate": 3.782051282051282e-06, "loss": 0.6863, "step": 1455 }, { "epoch": 12.444444444444445, "grad_norm": 3.0117695331573486, "learning_rate": 3.777777777777778e-06, "loss": 0.217, "step": 1456 }, { "epoch": 12.452991452991453, "grad_norm": 1.4753539562225342, "learning_rate": 3.7735042735042735e-06, "loss": 0.0623, "step": 1457 }, { "epoch": 12.461538461538462, "grad_norm": 2.095745325088501, "learning_rate": 3.7692307692307694e-06, "loss": 0.055, "step": 1458 }, { "epoch": 12.47008547008547, "grad_norm": 3.508305788040161, "learning_rate": 3.7649572649572653e-06, "loss": 0.1097, "step": 1459 }, { "epoch": 12.478632478632479, "grad_norm": 3.0965282917022705, "learning_rate": 3.760683760683761e-06, "loss": 0.3374, "step": 1460 }, { "epoch": 12.487179487179487, "grad_norm": 0.7286785244941711, "learning_rate": 3.7564102564102567e-06, "loss": 0.0182, "step": 1461 }, { "epoch": 12.495726495726496, "grad_norm": 5.957888126373291, "learning_rate": 3.7521367521367526e-06, "loss": 0.3498, "step": 1462 }, { "epoch": 12.504273504273504, "grad_norm": 10.433263778686523, "learning_rate": 3.747863247863248e-06, "loss": 0.446, "step": 1463 }, { "epoch": 12.512820512820513, "grad_norm": 4.565568923950195, "learning_rate": 3.743589743589744e-06, "loss": 0.1026, "step": 1464 }, { "epoch": 12.521367521367521, "grad_norm": 2.607106924057007, "learning_rate": 3.73931623931624e-06, "loss": 0.0912, "step": 1465 }, { "epoch": 12.52991452991453, "grad_norm": 2.415541410446167, "learning_rate": 3.7350427350427353e-06, "loss": 0.0594, "step": 1466 }, { "epoch": 12.538461538461538, "grad_norm": 7.978870868682861, "learning_rate": 3.730769230769231e-06, "loss": 0.2617, "step": 1467 }, { "epoch": 12.547008547008547, "grad_norm": 6.858293056488037, "learning_rate": 3.726495726495727e-06, "loss": 0.3642, "step": 1468 }, { "epoch": 12.555555555555555, "grad_norm": 1.3900551795959473, "learning_rate": 3.7222222222222225e-06, "loss": 0.0445, "step": 1469 }, { "epoch": 12.564102564102564, "grad_norm": 8.111970901489258, "learning_rate": 3.7179487179487184e-06, "loss": 0.1828, "step": 1470 }, { "epoch": 12.572649572649572, "grad_norm": 2.731841802597046, "learning_rate": 3.7136752136752143e-06, "loss": 0.2027, "step": 1471 }, { "epoch": 12.581196581196581, "grad_norm": 4.418527126312256, "learning_rate": 3.7094017094017098e-06, "loss": 0.1744, "step": 1472 }, { "epoch": 12.58974358974359, "grad_norm": 2.8263015747070312, "learning_rate": 3.7051282051282057e-06, "loss": 0.1123, "step": 1473 }, { "epoch": 12.598290598290598, "grad_norm": 2.3524725437164307, "learning_rate": 3.700854700854701e-06, "loss": 0.0999, "step": 1474 }, { "epoch": 12.606837606837606, "grad_norm": 9.863709449768066, "learning_rate": 3.696581196581197e-06, "loss": 0.4589, "step": 1475 }, { "epoch": 12.615384615384615, "grad_norm": 3.5506396293640137, "learning_rate": 3.692307692307693e-06, "loss": 0.2034, "step": 1476 }, { "epoch": 12.623931623931623, "grad_norm": 2.4352779388427734, "learning_rate": 3.6880341880341884e-06, "loss": 0.0806, "step": 1477 }, { "epoch": 12.632478632478632, "grad_norm": 1.8339797258377075, "learning_rate": 3.6837606837606843e-06, "loss": 0.0635, "step": 1478 }, { "epoch": 12.64102564102564, "grad_norm": 4.63474178314209, "learning_rate": 3.67948717948718e-06, "loss": 0.4568, "step": 1479 }, { "epoch": 12.649572649572649, "grad_norm": 7.696872711181641, "learning_rate": 3.6752136752136756e-06, "loss": 0.1769, "step": 1480 }, { "epoch": 12.658119658119658, "grad_norm": 1.3894271850585938, "learning_rate": 3.670940170940171e-06, "loss": 0.0747, "step": 1481 }, { "epoch": 12.666666666666666, "grad_norm": 5.607828140258789, "learning_rate": 3.6666666666666666e-06, "loss": 0.1178, "step": 1482 }, { "epoch": 12.675213675213675, "grad_norm": 2.120594024658203, "learning_rate": 3.6623931623931625e-06, "loss": 0.0497, "step": 1483 }, { "epoch": 12.683760683760683, "grad_norm": 1.359381914138794, "learning_rate": 3.6581196581196584e-06, "loss": 0.035, "step": 1484 }, { "epoch": 12.692307692307692, "grad_norm": 2.8533923625946045, "learning_rate": 3.653846153846154e-06, "loss": 0.1048, "step": 1485 }, { "epoch": 12.7008547008547, "grad_norm": 6.021198749542236, "learning_rate": 3.6495726495726497e-06, "loss": 0.1604, "step": 1486 }, { "epoch": 12.709401709401709, "grad_norm": 7.198216915130615, "learning_rate": 3.6452991452991456e-06, "loss": 0.1656, "step": 1487 }, { "epoch": 12.717948717948717, "grad_norm": 1.4581981897354126, "learning_rate": 3.641025641025641e-06, "loss": 0.0398, "step": 1488 }, { "epoch": 12.726495726495726, "grad_norm": 30.704627990722656, "learning_rate": 3.636752136752137e-06, "loss": 0.3371, "step": 1489 }, { "epoch": 12.735042735042736, "grad_norm": 2.5204057693481445, "learning_rate": 3.632478632478633e-06, "loss": 0.0742, "step": 1490 }, { "epoch": 12.743589743589745, "grad_norm": 2.3917508125305176, "learning_rate": 3.6282051282051283e-06, "loss": 0.1681, "step": 1491 }, { "epoch": 12.752136752136753, "grad_norm": 1.4529337882995605, "learning_rate": 3.623931623931624e-06, "loss": 0.0247, "step": 1492 }, { "epoch": 12.760683760683762, "grad_norm": 31.894805908203125, "learning_rate": 3.6196581196581197e-06, "loss": 0.2222, "step": 1493 }, { "epoch": 12.76923076923077, "grad_norm": 3.4240164756774902, "learning_rate": 3.6153846153846156e-06, "loss": 0.1432, "step": 1494 }, { "epoch": 12.777777777777779, "grad_norm": 2.0000102519989014, "learning_rate": 3.6111111111111115e-06, "loss": 0.0383, "step": 1495 }, { "epoch": 12.786324786324787, "grad_norm": 3.7665908336639404, "learning_rate": 3.606837606837607e-06, "loss": 0.2719, "step": 1496 }, { "epoch": 12.794871794871796, "grad_norm": 2.0319290161132812, "learning_rate": 3.602564102564103e-06, "loss": 0.0741, "step": 1497 }, { "epoch": 12.803418803418804, "grad_norm": 2.3379619121551514, "learning_rate": 3.5982905982905987e-06, "loss": 0.1155, "step": 1498 }, { "epoch": 12.811965811965813, "grad_norm": 5.183985233306885, "learning_rate": 3.594017094017094e-06, "loss": 0.0815, "step": 1499 }, { "epoch": 12.820512820512821, "grad_norm": 3.1432502269744873, "learning_rate": 3.58974358974359e-06, "loss": 0.1855, "step": 1500 }, { "epoch": 12.82905982905983, "grad_norm": 4.5739946365356445, "learning_rate": 3.585470085470086e-06, "loss": 0.1424, "step": 1501 }, { "epoch": 12.837606837606838, "grad_norm": 1.6006520986557007, "learning_rate": 3.5811965811965814e-06, "loss": 0.0305, "step": 1502 }, { "epoch": 12.846153846153847, "grad_norm": 3.937011241912842, "learning_rate": 3.5769230769230773e-06, "loss": 0.2497, "step": 1503 }, { "epoch": 12.854700854700855, "grad_norm": 2.6159651279449463, "learning_rate": 3.572649572649573e-06, "loss": 0.1067, "step": 1504 }, { "epoch": 12.863247863247864, "grad_norm": 2.578547239303589, "learning_rate": 3.5683760683760687e-06, "loss": 0.0663, "step": 1505 }, { "epoch": 12.871794871794872, "grad_norm": 2.3777639865875244, "learning_rate": 3.5641025641025646e-06, "loss": 0.0558, "step": 1506 }, { "epoch": 12.88034188034188, "grad_norm": 7.5656561851501465, "learning_rate": 3.5598290598290604e-06, "loss": 0.2448, "step": 1507 }, { "epoch": 12.88888888888889, "grad_norm": 4.21798849105835, "learning_rate": 3.555555555555556e-06, "loss": 0.1916, "step": 1508 }, { "epoch": 12.897435897435898, "grad_norm": 1.318049669265747, "learning_rate": 3.551282051282052e-06, "loss": 0.0387, "step": 1509 }, { "epoch": 12.905982905982906, "grad_norm": 2.4345362186431885, "learning_rate": 3.5470085470085473e-06, "loss": 0.061, "step": 1510 }, { "epoch": 12.914529914529915, "grad_norm": 3.2767112255096436, "learning_rate": 3.542735042735043e-06, "loss": 0.1627, "step": 1511 }, { "epoch": 12.923076923076923, "grad_norm": 6.881056785583496, "learning_rate": 3.538461538461539e-06, "loss": 0.2452, "step": 1512 }, { "epoch": 12.931623931623932, "grad_norm": 8.017362594604492, "learning_rate": 3.5341880341880345e-06, "loss": 0.1972, "step": 1513 }, { "epoch": 12.94017094017094, "grad_norm": 1.1411398649215698, "learning_rate": 3.5299145299145304e-06, "loss": 0.0243, "step": 1514 }, { "epoch": 12.948717948717949, "grad_norm": 4.486563205718994, "learning_rate": 3.5256410256410263e-06, "loss": 0.1347, "step": 1515 }, { "epoch": 12.957264957264957, "grad_norm": 2.348222494125366, "learning_rate": 3.5213675213675218e-06, "loss": 0.1828, "step": 1516 }, { "epoch": 12.965811965811966, "grad_norm": 2.2855775356292725, "learning_rate": 3.5170940170940177e-06, "loss": 0.0465, "step": 1517 }, { "epoch": 12.974358974358974, "grad_norm": 10.313456535339355, "learning_rate": 3.5128205128205127e-06, "loss": 0.3033, "step": 1518 }, { "epoch": 12.982905982905983, "grad_norm": 12.115890502929688, "learning_rate": 3.5085470085470086e-06, "loss": 0.6762, "step": 1519 }, { "epoch": 12.991452991452991, "grad_norm": 2.746267557144165, "learning_rate": 3.5042735042735045e-06, "loss": 0.123, "step": 1520 }, { "epoch": 13.0, "grad_norm": 5.204991340637207, "learning_rate": 3.5e-06, "loss": 0.2086, "step": 1521 }, { "epoch": 13.0, "eval_loss": 0.06878729909658432, "eval_runtime": 9.2334, "eval_samples_per_second": 50.469, "eval_steps_per_second": 6.39, "step": 1521 }, { "epoch": 13.008547008547009, "grad_norm": 1.8741862773895264, "learning_rate": 3.495726495726496e-06, "loss": 0.0594, "step": 1522 }, { "epoch": 13.017094017094017, "grad_norm": 1.6060154438018799, "learning_rate": 3.4914529914529917e-06, "loss": 0.0426, "step": 1523 }, { "epoch": 13.025641025641026, "grad_norm": 2.194714069366455, "learning_rate": 3.487179487179487e-06, "loss": 0.1907, "step": 1524 }, { "epoch": 13.034188034188034, "grad_norm": 0.716149628162384, "learning_rate": 3.482905982905983e-06, "loss": 0.0177, "step": 1525 }, { "epoch": 13.042735042735043, "grad_norm": 4.787989139556885, "learning_rate": 3.478632478632479e-06, "loss": 0.246, "step": 1526 }, { "epoch": 13.051282051282051, "grad_norm": 1.662338137626648, "learning_rate": 3.4743589743589744e-06, "loss": 0.0561, "step": 1527 }, { "epoch": 13.05982905982906, "grad_norm": 0.9663236737251282, "learning_rate": 3.4700854700854703e-06, "loss": 0.0392, "step": 1528 }, { "epoch": 13.068376068376068, "grad_norm": 0.8232766389846802, "learning_rate": 3.465811965811966e-06, "loss": 0.0221, "step": 1529 }, { "epoch": 13.076923076923077, "grad_norm": 2.434157609939575, "learning_rate": 3.4615384615384617e-06, "loss": 0.1777, "step": 1530 }, { "epoch": 13.085470085470085, "grad_norm": 2.768070936203003, "learning_rate": 3.4572649572649576e-06, "loss": 0.1101, "step": 1531 }, { "epoch": 13.094017094017094, "grad_norm": 2.061371088027954, "learning_rate": 3.452991452991453e-06, "loss": 0.0591, "step": 1532 }, { "epoch": 13.102564102564102, "grad_norm": 1.6127598285675049, "learning_rate": 3.448717948717949e-06, "loss": 0.3858, "step": 1533 }, { "epoch": 13.11111111111111, "grad_norm": 1.2561885118484497, "learning_rate": 3.444444444444445e-06, "loss": 0.0315, "step": 1534 }, { "epoch": 13.11965811965812, "grad_norm": 2.2859408855438232, "learning_rate": 3.4401709401709403e-06, "loss": 0.047, "step": 1535 }, { "epoch": 13.128205128205128, "grad_norm": 3.7528388500213623, "learning_rate": 3.435897435897436e-06, "loss": 0.1069, "step": 1536 }, { "epoch": 13.136752136752136, "grad_norm": 5.547614574432373, "learning_rate": 3.431623931623932e-06, "loss": 0.1411, "step": 1537 }, { "epoch": 13.145299145299145, "grad_norm": 1.6566565036773682, "learning_rate": 3.4273504273504275e-06, "loss": 0.0266, "step": 1538 }, { "epoch": 13.153846153846153, "grad_norm": 5.280163288116455, "learning_rate": 3.4230769230769234e-06, "loss": 0.0843, "step": 1539 }, { "epoch": 13.162393162393162, "grad_norm": 6.624744892120361, "learning_rate": 3.4188034188034193e-06, "loss": 0.1652, "step": 1540 }, { "epoch": 13.17094017094017, "grad_norm": 5.325616359710693, "learning_rate": 3.414529914529915e-06, "loss": 0.077, "step": 1541 }, { "epoch": 13.179487179487179, "grad_norm": 11.31779956817627, "learning_rate": 3.4102564102564107e-06, "loss": 0.4377, "step": 1542 }, { "epoch": 13.188034188034187, "grad_norm": 4.86885404586792, "learning_rate": 3.4059829059829066e-06, "loss": 0.2312, "step": 1543 }, { "epoch": 13.196581196581196, "grad_norm": 1.779068112373352, "learning_rate": 3.401709401709402e-06, "loss": 0.032, "step": 1544 }, { "epoch": 13.205128205128204, "grad_norm": 1.9934108257293701, "learning_rate": 3.397435897435898e-06, "loss": 0.0861, "step": 1545 }, { "epoch": 13.213675213675213, "grad_norm": 2.1829612255096436, "learning_rate": 3.3931623931623934e-06, "loss": 0.0855, "step": 1546 }, { "epoch": 13.222222222222221, "grad_norm": 31.108810424804688, "learning_rate": 3.3888888888888893e-06, "loss": 0.334, "step": 1547 }, { "epoch": 13.23076923076923, "grad_norm": 4.867705345153809, "learning_rate": 3.384615384615385e-06, "loss": 0.0808, "step": 1548 }, { "epoch": 13.239316239316238, "grad_norm": 3.226783275604248, "learning_rate": 3.3803418803418806e-06, "loss": 0.1806, "step": 1549 }, { "epoch": 13.247863247863247, "grad_norm": 1.4822824001312256, "learning_rate": 3.3760683760683765e-06, "loss": 0.0602, "step": 1550 }, { "epoch": 13.256410256410255, "grad_norm": 4.529379844665527, "learning_rate": 3.3717948717948724e-06, "loss": 0.318, "step": 1551 }, { "epoch": 13.264957264957266, "grad_norm": 3.2155706882476807, "learning_rate": 3.367521367521368e-06, "loss": 0.1006, "step": 1552 }, { "epoch": 13.273504273504274, "grad_norm": 2.2805707454681396, "learning_rate": 3.3632478632478638e-06, "loss": 0.0774, "step": 1553 }, { "epoch": 13.282051282051283, "grad_norm": 11.477370262145996, "learning_rate": 3.358974358974359e-06, "loss": 0.8342, "step": 1554 }, { "epoch": 13.290598290598291, "grad_norm": 3.8596534729003906, "learning_rate": 3.3547008547008547e-06, "loss": 0.1924, "step": 1555 }, { "epoch": 13.2991452991453, "grad_norm": 4.497336387634277, "learning_rate": 3.3504273504273506e-06, "loss": 0.2425, "step": 1556 }, { "epoch": 13.307692307692308, "grad_norm": 1.4496978521347046, "learning_rate": 3.346153846153846e-06, "loss": 0.0168, "step": 1557 }, { "epoch": 13.316239316239317, "grad_norm": 2.0277416706085205, "learning_rate": 3.341880341880342e-06, "loss": 0.0634, "step": 1558 }, { "epoch": 13.324786324786325, "grad_norm": 2.9120066165924072, "learning_rate": 3.337606837606838e-06, "loss": 0.1153, "step": 1559 }, { "epoch": 13.333333333333334, "grad_norm": 4.949625015258789, "learning_rate": 3.3333333333333333e-06, "loss": 0.1412, "step": 1560 }, { "epoch": 13.341880341880342, "grad_norm": 5.970853805541992, "learning_rate": 3.3290598290598292e-06, "loss": 0.1607, "step": 1561 }, { "epoch": 13.350427350427351, "grad_norm": 2.1988022327423096, "learning_rate": 3.324786324786325e-06, "loss": 0.0329, "step": 1562 }, { "epoch": 13.35897435897436, "grad_norm": 2.3578758239746094, "learning_rate": 3.3205128205128206e-06, "loss": 0.0711, "step": 1563 }, { "epoch": 13.367521367521368, "grad_norm": 4.554023742675781, "learning_rate": 3.3162393162393165e-06, "loss": 0.1929, "step": 1564 }, { "epoch": 13.376068376068377, "grad_norm": 3.577073335647583, "learning_rate": 3.311965811965812e-06, "loss": 0.0969, "step": 1565 }, { "epoch": 13.384615384615385, "grad_norm": 3.3863015174865723, "learning_rate": 3.307692307692308e-06, "loss": 0.2402, "step": 1566 }, { "epoch": 13.393162393162394, "grad_norm": 1.044550895690918, "learning_rate": 3.3034188034188037e-06, "loss": 0.026, "step": 1567 }, { "epoch": 13.401709401709402, "grad_norm": 3.1525843143463135, "learning_rate": 3.299145299145299e-06, "loss": 0.0619, "step": 1568 }, { "epoch": 13.41025641025641, "grad_norm": 2.0380606651306152, "learning_rate": 3.294871794871795e-06, "loss": 0.0477, "step": 1569 }, { "epoch": 13.418803418803419, "grad_norm": 2.4260973930358887, "learning_rate": 3.290598290598291e-06, "loss": 0.0709, "step": 1570 }, { "epoch": 13.427350427350428, "grad_norm": 20.958803176879883, "learning_rate": 3.2863247863247864e-06, "loss": 0.2297, "step": 1571 }, { "epoch": 13.435897435897436, "grad_norm": 2.847252368927002, "learning_rate": 3.2820512820512823e-06, "loss": 0.0565, "step": 1572 }, { "epoch": 13.444444444444445, "grad_norm": 3.646381139755249, "learning_rate": 3.277777777777778e-06, "loss": 0.3043, "step": 1573 }, { "epoch": 13.452991452991453, "grad_norm": 3.0526609420776367, "learning_rate": 3.2735042735042737e-06, "loss": 0.0941, "step": 1574 }, { "epoch": 13.461538461538462, "grad_norm": 1.6154388189315796, "learning_rate": 3.2692307692307696e-06, "loss": 0.0597, "step": 1575 }, { "epoch": 13.47008547008547, "grad_norm": 1.0825392007827759, "learning_rate": 3.2649572649572655e-06, "loss": 0.0325, "step": 1576 }, { "epoch": 13.478632478632479, "grad_norm": 6.045910358428955, "learning_rate": 3.260683760683761e-06, "loss": 0.2202, "step": 1577 }, { "epoch": 13.487179487179487, "grad_norm": 3.0401153564453125, "learning_rate": 3.256410256410257e-06, "loss": 0.0923, "step": 1578 }, { "epoch": 13.495726495726496, "grad_norm": 5.485551834106445, "learning_rate": 3.2521367521367527e-06, "loss": 0.3851, "step": 1579 }, { "epoch": 13.504273504273504, "grad_norm": 2.575057029724121, "learning_rate": 3.247863247863248e-06, "loss": 0.0307, "step": 1580 }, { "epoch": 13.512820512820513, "grad_norm": 2.7744545936584473, "learning_rate": 3.243589743589744e-06, "loss": 0.1791, "step": 1581 }, { "epoch": 13.521367521367521, "grad_norm": 2.430640459060669, "learning_rate": 3.2393162393162395e-06, "loss": 0.1128, "step": 1582 }, { "epoch": 13.52991452991453, "grad_norm": 4.902276992797852, "learning_rate": 3.2350427350427354e-06, "loss": 0.2661, "step": 1583 }, { "epoch": 13.538461538461538, "grad_norm": 2.601134777069092, "learning_rate": 3.2307692307692313e-06, "loss": 0.1311, "step": 1584 }, { "epoch": 13.547008547008547, "grad_norm": 6.309877395629883, "learning_rate": 3.2264957264957268e-06, "loss": 0.2621, "step": 1585 }, { "epoch": 13.555555555555555, "grad_norm": 2.079618215560913, "learning_rate": 3.2222222222222227e-06, "loss": 0.0702, "step": 1586 }, { "epoch": 13.564102564102564, "grad_norm": 2.309541702270508, "learning_rate": 3.2179487179487186e-06, "loss": 0.1577, "step": 1587 }, { "epoch": 13.572649572649572, "grad_norm": 4.723629951477051, "learning_rate": 3.213675213675214e-06, "loss": 0.142, "step": 1588 }, { "epoch": 13.581196581196581, "grad_norm": 2.557123899459839, "learning_rate": 3.20940170940171e-06, "loss": 0.1506, "step": 1589 }, { "epoch": 13.58974358974359, "grad_norm": 2.3154499530792236, "learning_rate": 3.205128205128206e-06, "loss": 0.1039, "step": 1590 }, { "epoch": 13.598290598290598, "grad_norm": 1.5464012622833252, "learning_rate": 3.200854700854701e-06, "loss": 0.0989, "step": 1591 }, { "epoch": 13.606837606837606, "grad_norm": 1.5885653495788574, "learning_rate": 3.1965811965811967e-06, "loss": 0.0278, "step": 1592 }, { "epoch": 13.615384615384615, "grad_norm": 2.7710390090942383, "learning_rate": 3.192307692307692e-06, "loss": 0.0521, "step": 1593 }, { "epoch": 13.623931623931623, "grad_norm": 4.587305545806885, "learning_rate": 3.188034188034188e-06, "loss": 0.2609, "step": 1594 }, { "epoch": 13.632478632478632, "grad_norm": 4.343963623046875, "learning_rate": 3.183760683760684e-06, "loss": 0.1079, "step": 1595 }, { "epoch": 13.64102564102564, "grad_norm": 2.7653536796569824, "learning_rate": 3.1794871794871795e-06, "loss": 0.1293, "step": 1596 }, { "epoch": 13.649572649572649, "grad_norm": 3.1731350421905518, "learning_rate": 3.1752136752136753e-06, "loss": 0.1279, "step": 1597 }, { "epoch": 13.658119658119658, "grad_norm": 8.032745361328125, "learning_rate": 3.1709401709401712e-06, "loss": 0.2114, "step": 1598 }, { "epoch": 13.666666666666666, "grad_norm": 5.6177263259887695, "learning_rate": 3.1666666666666667e-06, "loss": 0.0926, "step": 1599 }, { "epoch": 13.675213675213675, "grad_norm": 3.3568480014801025, "learning_rate": 3.1623931623931626e-06, "loss": 0.1299, "step": 1600 }, { "epoch": 13.683760683760683, "grad_norm": 5.182860374450684, "learning_rate": 3.158119658119658e-06, "loss": 0.1688, "step": 1601 }, { "epoch": 13.692307692307692, "grad_norm": 5.954287052154541, "learning_rate": 3.153846153846154e-06, "loss": 0.2634, "step": 1602 }, { "epoch": 13.7008547008547, "grad_norm": 2.8563358783721924, "learning_rate": 3.14957264957265e-06, "loss": 0.0469, "step": 1603 }, { "epoch": 13.709401709401709, "grad_norm": 1.6049034595489502, "learning_rate": 3.1452991452991453e-06, "loss": 0.0855, "step": 1604 }, { "epoch": 13.717948717948717, "grad_norm": 1.9734570980072021, "learning_rate": 3.141025641025641e-06, "loss": 0.0554, "step": 1605 }, { "epoch": 13.726495726495726, "grad_norm": 1.8398605585098267, "learning_rate": 3.136752136752137e-06, "loss": 0.1033, "step": 1606 }, { "epoch": 13.735042735042736, "grad_norm": 3.3013346195220947, "learning_rate": 3.1324786324786326e-06, "loss": 0.1476, "step": 1607 }, { "epoch": 13.743589743589745, "grad_norm": 1.2622041702270508, "learning_rate": 3.1282051282051284e-06, "loss": 0.0222, "step": 1608 }, { "epoch": 13.752136752136753, "grad_norm": 3.983888626098633, "learning_rate": 3.1239316239316243e-06, "loss": 0.0861, "step": 1609 }, { "epoch": 13.760683760683762, "grad_norm": 2.883335828781128, "learning_rate": 3.11965811965812e-06, "loss": 0.0737, "step": 1610 }, { "epoch": 13.76923076923077, "grad_norm": 0.9045059680938721, "learning_rate": 3.1153846153846157e-06, "loss": 0.0232, "step": 1611 }, { "epoch": 13.777777777777779, "grad_norm": 1.8752232789993286, "learning_rate": 3.1111111111111116e-06, "loss": 0.0602, "step": 1612 }, { "epoch": 13.786324786324787, "grad_norm": 3.088440418243408, "learning_rate": 3.106837606837607e-06, "loss": 0.102, "step": 1613 }, { "epoch": 13.794871794871796, "grad_norm": 4.067224502563477, "learning_rate": 3.102564102564103e-06, "loss": 0.1461, "step": 1614 }, { "epoch": 13.803418803418804, "grad_norm": 6.9123148918151855, "learning_rate": 3.098290598290599e-06, "loss": 0.0752, "step": 1615 }, { "epoch": 13.811965811965813, "grad_norm": 17.15372657775879, "learning_rate": 3.0940170940170943e-06, "loss": 0.5163, "step": 1616 }, { "epoch": 13.820512820512821, "grad_norm": 2.4951720237731934, "learning_rate": 3.08974358974359e-06, "loss": 0.1326, "step": 1617 }, { "epoch": 13.82905982905983, "grad_norm": 2.1316449642181396, "learning_rate": 3.0854700854700857e-06, "loss": 0.0469, "step": 1618 }, { "epoch": 13.837606837606838, "grad_norm": 2.5955941677093506, "learning_rate": 3.0811965811965815e-06, "loss": 0.1056, "step": 1619 }, { "epoch": 13.846153846153847, "grad_norm": 14.360347747802734, "learning_rate": 3.0769230769230774e-06, "loss": 0.4793, "step": 1620 }, { "epoch": 13.854700854700855, "grad_norm": 1.9134567975997925, "learning_rate": 3.072649572649573e-06, "loss": 0.054, "step": 1621 }, { "epoch": 13.863247863247864, "grad_norm": 3.1168692111968994, "learning_rate": 3.068376068376069e-06, "loss": 0.321, "step": 1622 }, { "epoch": 13.871794871794872, "grad_norm": 4.940008163452148, "learning_rate": 3.0641025641025647e-06, "loss": 0.1452, "step": 1623 }, { "epoch": 13.88034188034188, "grad_norm": 3.001660108566284, "learning_rate": 3.05982905982906e-06, "loss": 0.1094, "step": 1624 }, { "epoch": 13.88888888888889, "grad_norm": 1.3110100030899048, "learning_rate": 3.055555555555556e-06, "loss": 0.0305, "step": 1625 }, { "epoch": 13.897435897435898, "grad_norm": 269.3442077636719, "learning_rate": 3.051282051282052e-06, "loss": 0.8319, "step": 1626 }, { "epoch": 13.905982905982906, "grad_norm": 1.5236955881118774, "learning_rate": 3.0470085470085474e-06, "loss": 0.0294, "step": 1627 }, { "epoch": 13.914529914529915, "grad_norm": 1.8342583179473877, "learning_rate": 3.042735042735043e-06, "loss": 0.1122, "step": 1628 }, { "epoch": 13.923076923076923, "grad_norm": 1.7902953624725342, "learning_rate": 3.0384615384615383e-06, "loss": 0.0426, "step": 1629 }, { "epoch": 13.931623931623932, "grad_norm": 1.461769938468933, "learning_rate": 3.0341880341880342e-06, "loss": 0.0326, "step": 1630 }, { "epoch": 13.94017094017094, "grad_norm": 2.2590038776397705, "learning_rate": 3.02991452991453e-06, "loss": 0.067, "step": 1631 }, { "epoch": 13.948717948717949, "grad_norm": 0.8894402980804443, "learning_rate": 3.0256410256410256e-06, "loss": 0.0269, "step": 1632 }, { "epoch": 13.957264957264957, "grad_norm": 2.097757339477539, "learning_rate": 3.0213675213675215e-06, "loss": 0.1211, "step": 1633 }, { "epoch": 13.965811965811966, "grad_norm": 4.112930774688721, "learning_rate": 3.0170940170940174e-06, "loss": 0.1026, "step": 1634 }, { "epoch": 13.974358974358974, "grad_norm": 4.55318021774292, "learning_rate": 3.012820512820513e-06, "loss": 0.2808, "step": 1635 }, { "epoch": 13.982905982905983, "grad_norm": 2.1912014484405518, "learning_rate": 3.0085470085470087e-06, "loss": 0.0906, "step": 1636 }, { "epoch": 13.991452991452991, "grad_norm": 4.612771511077881, "learning_rate": 3.004273504273504e-06, "loss": 0.17, "step": 1637 }, { "epoch": 14.0, "grad_norm": 7.162411212921143, "learning_rate": 3e-06, "loss": 0.131, "step": 1638 }, { "epoch": 14.0, "eval_loss": 0.06268326193094254, "eval_runtime": 9.262, "eval_samples_per_second": 50.313, "eval_steps_per_second": 6.37, "step": 1638 }, { "epoch": 14.008547008547009, "grad_norm": 4.41022253036499, "learning_rate": 2.995726495726496e-06, "loss": 0.1989, "step": 1639 }, { "epoch": 14.017094017094017, "grad_norm": 2.2863216400146484, "learning_rate": 2.9914529914529914e-06, "loss": 0.0612, "step": 1640 }, { "epoch": 14.025641025641026, "grad_norm": 1.5455230474472046, "learning_rate": 2.9871794871794873e-06, "loss": 0.0378, "step": 1641 }, { "epoch": 14.034188034188034, "grad_norm": 0.9546025991439819, "learning_rate": 2.9829059829059832e-06, "loss": 0.0214, "step": 1642 }, { "epoch": 14.042735042735043, "grad_norm": 5.546824932098389, "learning_rate": 2.9786324786324787e-06, "loss": 0.2502, "step": 1643 }, { "epoch": 14.051282051282051, "grad_norm": 1.6261364221572876, "learning_rate": 2.9743589743589746e-06, "loss": 0.0271, "step": 1644 }, { "epoch": 14.05982905982906, "grad_norm": 1.710256814956665, "learning_rate": 2.9700854700854705e-06, "loss": 0.0582, "step": 1645 }, { "epoch": 14.068376068376068, "grad_norm": 1.2083494663238525, "learning_rate": 2.965811965811966e-06, "loss": 0.026, "step": 1646 }, { "epoch": 14.076923076923077, "grad_norm": 3.6400561332702637, "learning_rate": 2.961538461538462e-06, "loss": 0.0896, "step": 1647 }, { "epoch": 14.085470085470085, "grad_norm": 2.1084742546081543, "learning_rate": 2.9572649572649577e-06, "loss": 0.0269, "step": 1648 }, { "epoch": 14.094017094017094, "grad_norm": 1.5661289691925049, "learning_rate": 2.952991452991453e-06, "loss": 0.0401, "step": 1649 }, { "epoch": 14.102564102564102, "grad_norm": 23.358585357666016, "learning_rate": 2.948717948717949e-06, "loss": 0.2069, "step": 1650 }, { "epoch": 14.11111111111111, "grad_norm": 9.171899795532227, "learning_rate": 2.944444444444445e-06, "loss": 0.2842, "step": 1651 }, { "epoch": 14.11965811965812, "grad_norm": 1.3189946413040161, "learning_rate": 2.9401709401709404e-06, "loss": 0.0331, "step": 1652 }, { "epoch": 14.128205128205128, "grad_norm": 3.6144192218780518, "learning_rate": 2.9358974358974363e-06, "loss": 0.2069, "step": 1653 }, { "epoch": 14.136752136752136, "grad_norm": 2.764681577682495, "learning_rate": 2.931623931623932e-06, "loss": 0.0646, "step": 1654 }, { "epoch": 14.145299145299145, "grad_norm": 2.073028564453125, "learning_rate": 2.9273504273504277e-06, "loss": 0.1223, "step": 1655 }, { "epoch": 14.153846153846153, "grad_norm": 12.209549903869629, "learning_rate": 2.9230769230769236e-06, "loss": 0.1922, "step": 1656 }, { "epoch": 14.162393162393162, "grad_norm": 3.1137638092041016, "learning_rate": 2.918803418803419e-06, "loss": 0.2586, "step": 1657 }, { "epoch": 14.17094017094017, "grad_norm": 5.130307674407959, "learning_rate": 2.914529914529915e-06, "loss": 0.2695, "step": 1658 }, { "epoch": 14.179487179487179, "grad_norm": 3.475097894668579, "learning_rate": 2.910256410256411e-06, "loss": 0.2131, "step": 1659 }, { "epoch": 14.188034188034187, "grad_norm": 0.5851498246192932, "learning_rate": 2.9059829059829063e-06, "loss": 0.0167, "step": 1660 }, { "epoch": 14.196581196581196, "grad_norm": 1.795509934425354, "learning_rate": 2.901709401709402e-06, "loss": 0.0857, "step": 1661 }, { "epoch": 14.205128205128204, "grad_norm": 1.7123979330062866, "learning_rate": 2.897435897435898e-06, "loss": 0.0599, "step": 1662 }, { "epoch": 14.213675213675213, "grad_norm": 1.230388879776001, "learning_rate": 2.8931623931623935e-06, "loss": 0.0255, "step": 1663 }, { "epoch": 14.222222222222221, "grad_norm": 3.8747615814208984, "learning_rate": 2.888888888888889e-06, "loss": 0.1412, "step": 1664 }, { "epoch": 14.23076923076923, "grad_norm": 2.233584403991699, "learning_rate": 2.8846153846153845e-06, "loss": 0.068, "step": 1665 }, { "epoch": 14.239316239316238, "grad_norm": 5.327254772186279, "learning_rate": 2.8803418803418804e-06, "loss": 0.2616, "step": 1666 }, { "epoch": 14.247863247863247, "grad_norm": 6.126563549041748, "learning_rate": 2.8760683760683762e-06, "loss": 0.0931, "step": 1667 }, { "epoch": 14.256410256410255, "grad_norm": 1.4305050373077393, "learning_rate": 2.8717948717948717e-06, "loss": 0.0221, "step": 1668 }, { "epoch": 14.264957264957266, "grad_norm": 3.0924506187438965, "learning_rate": 2.8675213675213676e-06, "loss": 0.0417, "step": 1669 }, { "epoch": 14.273504273504274, "grad_norm": 2.548558235168457, "learning_rate": 2.8632478632478635e-06, "loss": 0.0744, "step": 1670 }, { "epoch": 14.282051282051283, "grad_norm": 0.46632057428359985, "learning_rate": 2.858974358974359e-06, "loss": 0.0114, "step": 1671 }, { "epoch": 14.290598290598291, "grad_norm": 2.5199391841888428, "learning_rate": 2.854700854700855e-06, "loss": 0.0819, "step": 1672 }, { "epoch": 14.2991452991453, "grad_norm": 1.849133014678955, "learning_rate": 2.8504273504273507e-06, "loss": 0.0424, "step": 1673 }, { "epoch": 14.307692307692308, "grad_norm": 2.9396777153015137, "learning_rate": 2.846153846153846e-06, "loss": 0.0836, "step": 1674 }, { "epoch": 14.316239316239317, "grad_norm": 0.7128950953483582, "learning_rate": 2.841880341880342e-06, "loss": 0.0181, "step": 1675 }, { "epoch": 14.324786324786325, "grad_norm": 2.1387767791748047, "learning_rate": 2.8376068376068376e-06, "loss": 0.0432, "step": 1676 }, { "epoch": 14.333333333333334, "grad_norm": 7.104556083679199, "learning_rate": 2.8333333333333335e-06, "loss": 0.1277, "step": 1677 }, { "epoch": 14.341880341880342, "grad_norm": 3.718749761581421, "learning_rate": 2.8290598290598293e-06, "loss": 0.0738, "step": 1678 }, { "epoch": 14.350427350427351, "grad_norm": 3.9387831687927246, "learning_rate": 2.824786324786325e-06, "loss": 0.1374, "step": 1679 }, { "epoch": 14.35897435897436, "grad_norm": 2.1527843475341797, "learning_rate": 2.8205128205128207e-06, "loss": 0.1426, "step": 1680 }, { "epoch": 14.367521367521368, "grad_norm": 1.0589011907577515, "learning_rate": 2.8162393162393166e-06, "loss": 0.0343, "step": 1681 }, { "epoch": 14.376068376068377, "grad_norm": 3.55014967918396, "learning_rate": 2.811965811965812e-06, "loss": 0.2962, "step": 1682 }, { "epoch": 14.384615384615385, "grad_norm": 3.996713399887085, "learning_rate": 2.807692307692308e-06, "loss": 0.1458, "step": 1683 }, { "epoch": 14.393162393162394, "grad_norm": 73.28384399414062, "learning_rate": 2.803418803418804e-06, "loss": 0.6138, "step": 1684 }, { "epoch": 14.401709401709402, "grad_norm": 5.780628681182861, "learning_rate": 2.7991452991452993e-06, "loss": 0.2619, "step": 1685 }, { "epoch": 14.41025641025641, "grad_norm": 3.2047317028045654, "learning_rate": 2.794871794871795e-06, "loss": 0.1917, "step": 1686 }, { "epoch": 14.418803418803419, "grad_norm": 7.041647434234619, "learning_rate": 2.790598290598291e-06, "loss": 0.2136, "step": 1687 }, { "epoch": 14.427350427350428, "grad_norm": 3.391404867172241, "learning_rate": 2.7863247863247866e-06, "loss": 0.094, "step": 1688 }, { "epoch": 14.435897435897436, "grad_norm": 0.5430964231491089, "learning_rate": 2.7820512820512824e-06, "loss": 0.0139, "step": 1689 }, { "epoch": 14.444444444444445, "grad_norm": 5.696547985076904, "learning_rate": 2.7777777777777783e-06, "loss": 0.5808, "step": 1690 }, { "epoch": 14.452991452991453, "grad_norm": 3.5785481929779053, "learning_rate": 2.773504273504274e-06, "loss": 0.219, "step": 1691 }, { "epoch": 14.461538461538462, "grad_norm": 6.63624906539917, "learning_rate": 2.7692307692307697e-06, "loss": 0.2586, "step": 1692 }, { "epoch": 14.47008547008547, "grad_norm": 16.79705810546875, "learning_rate": 2.764957264957265e-06, "loss": 0.1762, "step": 1693 }, { "epoch": 14.478632478632479, "grad_norm": 4.069973468780518, "learning_rate": 2.760683760683761e-06, "loss": 0.1191, "step": 1694 }, { "epoch": 14.487179487179487, "grad_norm": 1.1191340684890747, "learning_rate": 2.756410256410257e-06, "loss": 0.0529, "step": 1695 }, { "epoch": 14.495726495726496, "grad_norm": 2.23835825920105, "learning_rate": 2.7521367521367524e-06, "loss": 0.0681, "step": 1696 }, { "epoch": 14.504273504273504, "grad_norm": 2.745694160461426, "learning_rate": 2.7478632478632483e-06, "loss": 0.1885, "step": 1697 }, { "epoch": 14.512820512820513, "grad_norm": 3.642946720123291, "learning_rate": 2.743589743589744e-06, "loss": 0.2061, "step": 1698 }, { "epoch": 14.521367521367521, "grad_norm": 2.7571651935577393, "learning_rate": 2.7393162393162397e-06, "loss": 0.074, "step": 1699 }, { "epoch": 14.52991452991453, "grad_norm": 0.889057457447052, "learning_rate": 2.7350427350427355e-06, "loss": 0.0342, "step": 1700 }, { "epoch": 14.538461538461538, "grad_norm": 0.5471668243408203, "learning_rate": 2.7307692307692306e-06, "loss": 0.0125, "step": 1701 }, { "epoch": 14.547008547008547, "grad_norm": 6.883024215698242, "learning_rate": 2.7264957264957265e-06, "loss": 0.4102, "step": 1702 }, { "epoch": 14.555555555555555, "grad_norm": 2.6678171157836914, "learning_rate": 2.7222222222222224e-06, "loss": 0.0872, "step": 1703 }, { "epoch": 14.564102564102564, "grad_norm": 5.825995445251465, "learning_rate": 2.717948717948718e-06, "loss": 0.1081, "step": 1704 }, { "epoch": 14.572649572649572, "grad_norm": 1.5447179079055786, "learning_rate": 2.7136752136752137e-06, "loss": 0.0838, "step": 1705 }, { "epoch": 14.581196581196581, "grad_norm": 17.58099937438965, "learning_rate": 2.7094017094017096e-06, "loss": 0.6379, "step": 1706 }, { "epoch": 14.58974358974359, "grad_norm": 0.9537908434867859, "learning_rate": 2.705128205128205e-06, "loss": 0.0221, "step": 1707 }, { "epoch": 14.598290598290598, "grad_norm": 3.264037847518921, "learning_rate": 2.700854700854701e-06, "loss": 0.1282, "step": 1708 }, { "epoch": 14.606837606837606, "grad_norm": 1.7752703428268433, "learning_rate": 2.696581196581197e-06, "loss": 0.0194, "step": 1709 }, { "epoch": 14.615384615384615, "grad_norm": 4.8417649269104, "learning_rate": 2.6923076923076923e-06, "loss": 0.2217, "step": 1710 }, { "epoch": 14.623931623931623, "grad_norm": 2.915694236755371, "learning_rate": 2.6880341880341882e-06, "loss": 0.1506, "step": 1711 }, { "epoch": 14.632478632478632, "grad_norm": 10.983115196228027, "learning_rate": 2.6837606837606837e-06, "loss": 0.4307, "step": 1712 }, { "epoch": 14.64102564102564, "grad_norm": 1.1121952533721924, "learning_rate": 2.6794871794871796e-06, "loss": 0.0211, "step": 1713 }, { "epoch": 14.649572649572649, "grad_norm": 2.6676313877105713, "learning_rate": 2.6752136752136755e-06, "loss": 0.0997, "step": 1714 }, { "epoch": 14.658119658119658, "grad_norm": 1.718767523765564, "learning_rate": 2.670940170940171e-06, "loss": 0.0533, "step": 1715 }, { "epoch": 14.666666666666666, "grad_norm": 1.567866563796997, "learning_rate": 2.666666666666667e-06, "loss": 0.0913, "step": 1716 }, { "epoch": 14.675213675213675, "grad_norm": 3.0697431564331055, "learning_rate": 2.6623931623931627e-06, "loss": 0.1133, "step": 1717 }, { "epoch": 14.683760683760683, "grad_norm": 2.2237489223480225, "learning_rate": 2.658119658119658e-06, "loss": 0.1091, "step": 1718 }, { "epoch": 14.692307692307692, "grad_norm": 6.050041198730469, "learning_rate": 2.653846153846154e-06, "loss": 0.5622, "step": 1719 }, { "epoch": 14.7008547008547, "grad_norm": 1.1796153783798218, "learning_rate": 2.64957264957265e-06, "loss": 0.0522, "step": 1720 }, { "epoch": 14.709401709401709, "grad_norm": 2.4849863052368164, "learning_rate": 2.6452991452991454e-06, "loss": 0.0332, "step": 1721 }, { "epoch": 14.717948717948717, "grad_norm": 1.771933674812317, "learning_rate": 2.6410256410256413e-06, "loss": 0.0692, "step": 1722 }, { "epoch": 14.726495726495726, "grad_norm": 4.174441337585449, "learning_rate": 2.6367521367521372e-06, "loss": 0.1419, "step": 1723 }, { "epoch": 14.735042735042736, "grad_norm": 4.145920276641846, "learning_rate": 2.6324786324786327e-06, "loss": 0.5196, "step": 1724 }, { "epoch": 14.743589743589745, "grad_norm": 3.363537073135376, "learning_rate": 2.6282051282051286e-06, "loss": 0.1187, "step": 1725 }, { "epoch": 14.752136752136753, "grad_norm": 1.9558751583099365, "learning_rate": 2.6239316239316245e-06, "loss": 0.0193, "step": 1726 }, { "epoch": 14.760683760683762, "grad_norm": 2.8293466567993164, "learning_rate": 2.61965811965812e-06, "loss": 0.0551, "step": 1727 }, { "epoch": 14.76923076923077, "grad_norm": 1.2654905319213867, "learning_rate": 2.615384615384616e-06, "loss": 0.0805, "step": 1728 }, { "epoch": 14.777777777777779, "grad_norm": 0.9344054460525513, "learning_rate": 2.6111111111111113e-06, "loss": 0.0177, "step": 1729 }, { "epoch": 14.786324786324787, "grad_norm": 1.268433690071106, "learning_rate": 2.606837606837607e-06, "loss": 0.0185, "step": 1730 }, { "epoch": 14.794871794871796, "grad_norm": 2.5544192790985107, "learning_rate": 2.602564102564103e-06, "loss": 0.063, "step": 1731 }, { "epoch": 14.803418803418804, "grad_norm": 2.1078386306762695, "learning_rate": 2.5982905982905985e-06, "loss": 0.1203, "step": 1732 }, { "epoch": 14.811965811965813, "grad_norm": 1.526848554611206, "learning_rate": 2.5940170940170944e-06, "loss": 0.0524, "step": 1733 }, { "epoch": 14.820512820512821, "grad_norm": 0.7479220628738403, "learning_rate": 2.5897435897435903e-06, "loss": 0.0197, "step": 1734 }, { "epoch": 14.82905982905983, "grad_norm": 2.937556266784668, "learning_rate": 2.5854700854700858e-06, "loss": 0.1406, "step": 1735 }, { "epoch": 14.837606837606838, "grad_norm": 2.3128576278686523, "learning_rate": 2.5811965811965817e-06, "loss": 0.056, "step": 1736 }, { "epoch": 14.846153846153847, "grad_norm": 2.1093039512634277, "learning_rate": 2.5769230769230767e-06, "loss": 0.0645, "step": 1737 }, { "epoch": 14.854700854700855, "grad_norm": 2.104214668273926, "learning_rate": 2.5726495726495726e-06, "loss": 0.1097, "step": 1738 }, { "epoch": 14.863247863247864, "grad_norm": 3.781390428543091, "learning_rate": 2.5683760683760685e-06, "loss": 0.1214, "step": 1739 }, { "epoch": 14.871794871794872, "grad_norm": 4.119661331176758, "learning_rate": 2.564102564102564e-06, "loss": 0.1797, "step": 1740 }, { "epoch": 14.88034188034188, "grad_norm": 6.488205909729004, "learning_rate": 2.55982905982906e-06, "loss": 0.0679, "step": 1741 }, { "epoch": 14.88888888888889, "grad_norm": 1.4211604595184326, "learning_rate": 2.5555555555555557e-06, "loss": 0.0375, "step": 1742 }, { "epoch": 14.897435897435898, "grad_norm": 3.577533721923828, "learning_rate": 2.5512820512820512e-06, "loss": 0.1914, "step": 1743 }, { "epoch": 14.905982905982906, "grad_norm": 8.697205543518066, "learning_rate": 2.547008547008547e-06, "loss": 0.5511, "step": 1744 }, { "epoch": 14.914529914529915, "grad_norm": 0.49716269969940186, "learning_rate": 2.542735042735043e-06, "loss": 0.0125, "step": 1745 }, { "epoch": 14.923076923076923, "grad_norm": 2.8563008308410645, "learning_rate": 2.5384615384615385e-06, "loss": 0.0901, "step": 1746 }, { "epoch": 14.931623931623932, "grad_norm": 3.6407926082611084, "learning_rate": 2.5341880341880344e-06, "loss": 0.0718, "step": 1747 }, { "epoch": 14.94017094017094, "grad_norm": 1.2601441144943237, "learning_rate": 2.52991452991453e-06, "loss": 0.0451, "step": 1748 }, { "epoch": 14.948717948717949, "grad_norm": 2.4402401447296143, "learning_rate": 2.5256410256410257e-06, "loss": 0.0771, "step": 1749 }, { "epoch": 14.957264957264957, "grad_norm": 0.6150484681129456, "learning_rate": 2.5213675213675216e-06, "loss": 0.0151, "step": 1750 }, { "epoch": 14.965811965811966, "grad_norm": 3.6569836139678955, "learning_rate": 2.517094017094017e-06, "loss": 0.0905, "step": 1751 }, { "epoch": 14.974358974358974, "grad_norm": 3.4421300888061523, "learning_rate": 2.512820512820513e-06, "loss": 0.0456, "step": 1752 }, { "epoch": 14.982905982905983, "grad_norm": 3.565871477127075, "learning_rate": 2.508547008547009e-06, "loss": 0.0491, "step": 1753 }, { "epoch": 14.991452991452991, "grad_norm": 37.519065856933594, "learning_rate": 2.5042735042735043e-06, "loss": 0.1348, "step": 1754 }, { "epoch": 15.0, "grad_norm": 5.1902899742126465, "learning_rate": 2.5e-06, "loss": 0.1099, "step": 1755 }, { "epoch": 15.0, "eval_loss": 0.05930963531136513, "eval_runtime": 9.2206, "eval_samples_per_second": 50.539, "eval_steps_per_second": 6.399, "step": 1755 }, { "epoch": 15.008547008547009, "grad_norm": 5.6569342613220215, "learning_rate": 2.495726495726496e-06, "loss": 0.1931, "step": 1756 }, { "epoch": 15.017094017094017, "grad_norm": 5.23728084564209, "learning_rate": 2.4914529914529916e-06, "loss": 0.2789, "step": 1757 }, { "epoch": 15.025641025641026, "grad_norm": 0.8648807406425476, "learning_rate": 2.4871794871794875e-06, "loss": 0.0227, "step": 1758 }, { "epoch": 15.034188034188034, "grad_norm": 3.0654587745666504, "learning_rate": 2.4829059829059833e-06, "loss": 0.0602, "step": 1759 }, { "epoch": 15.042735042735043, "grad_norm": 4.374608039855957, "learning_rate": 2.478632478632479e-06, "loss": 0.2133, "step": 1760 }, { "epoch": 15.051282051282051, "grad_norm": 1.2764301300048828, "learning_rate": 2.4743589743589747e-06, "loss": 0.0296, "step": 1761 }, { "epoch": 15.05982905982906, "grad_norm": 0.9672349095344543, "learning_rate": 2.4700854700854706e-06, "loss": 0.0224, "step": 1762 }, { "epoch": 15.068376068376068, "grad_norm": 8.807465553283691, "learning_rate": 2.465811965811966e-06, "loss": 0.0925, "step": 1763 }, { "epoch": 15.076923076923077, "grad_norm": 1.4733474254608154, "learning_rate": 2.461538461538462e-06, "loss": 0.0286, "step": 1764 }, { "epoch": 15.085470085470085, "grad_norm": 6.014289855957031, "learning_rate": 2.4572649572649574e-06, "loss": 0.1387, "step": 1765 }, { "epoch": 15.094017094017094, "grad_norm": 1.899086356163025, "learning_rate": 2.452991452991453e-06, "loss": 0.07, "step": 1766 }, { "epoch": 15.102564102564102, "grad_norm": 11.32197380065918, "learning_rate": 2.4487179487179488e-06, "loss": 0.2452, "step": 1767 }, { "epoch": 15.11111111111111, "grad_norm": 3.223996639251709, "learning_rate": 2.4444444444444447e-06, "loss": 0.139, "step": 1768 }, { "epoch": 15.11965811965812, "grad_norm": 2.8729913234710693, "learning_rate": 2.44017094017094e-06, "loss": 0.1386, "step": 1769 }, { "epoch": 15.128205128205128, "grad_norm": 1.9730579853057861, "learning_rate": 2.435897435897436e-06, "loss": 0.0882, "step": 1770 }, { "epoch": 15.136752136752136, "grad_norm": 5.556413650512695, "learning_rate": 2.431623931623932e-06, "loss": 0.1554, "step": 1771 }, { "epoch": 15.145299145299145, "grad_norm": 1.2356898784637451, "learning_rate": 2.4273504273504274e-06, "loss": 0.0217, "step": 1772 }, { "epoch": 15.153846153846153, "grad_norm": 7.849127769470215, "learning_rate": 2.4230769230769233e-06, "loss": 0.221, "step": 1773 }, { "epoch": 15.162393162393162, "grad_norm": 0.5792569518089294, "learning_rate": 2.418803418803419e-06, "loss": 0.017, "step": 1774 }, { "epoch": 15.17094017094017, "grad_norm": 2.2549376487731934, "learning_rate": 2.4145299145299146e-06, "loss": 0.0499, "step": 1775 }, { "epoch": 15.179487179487179, "grad_norm": 2.722200870513916, "learning_rate": 2.4102564102564105e-06, "loss": 0.0408, "step": 1776 }, { "epoch": 15.188034188034187, "grad_norm": 3.1140944957733154, "learning_rate": 2.4059829059829064e-06, "loss": 0.1001, "step": 1777 }, { "epoch": 15.196581196581196, "grad_norm": 4.461791515350342, "learning_rate": 2.401709401709402e-06, "loss": 0.3419, "step": 1778 }, { "epoch": 15.205128205128204, "grad_norm": 1.8562372922897339, "learning_rate": 2.3974358974358978e-06, "loss": 0.1092, "step": 1779 }, { "epoch": 15.213675213675213, "grad_norm": 5.2086181640625, "learning_rate": 2.3931623931623937e-06, "loss": 0.1767, "step": 1780 }, { "epoch": 15.222222222222221, "grad_norm": 1.6226582527160645, "learning_rate": 2.388888888888889e-06, "loss": 0.0347, "step": 1781 }, { "epoch": 15.23076923076923, "grad_norm": 2.8507306575775146, "learning_rate": 2.384615384615385e-06, "loss": 0.0934, "step": 1782 }, { "epoch": 15.239316239316238, "grad_norm": 2.74642276763916, "learning_rate": 2.3803418803418805e-06, "loss": 0.0857, "step": 1783 }, { "epoch": 15.247863247863247, "grad_norm": 3.4352660179138184, "learning_rate": 2.376068376068376e-06, "loss": 0.2336, "step": 1784 }, { "epoch": 15.256410256410255, "grad_norm": 3.4673473834991455, "learning_rate": 2.371794871794872e-06, "loss": 0.1974, "step": 1785 }, { "epoch": 15.264957264957266, "grad_norm": 21.467744827270508, "learning_rate": 2.3675213675213677e-06, "loss": 0.6836, "step": 1786 }, { "epoch": 15.273504273504274, "grad_norm": 2.832465887069702, "learning_rate": 2.363247863247863e-06, "loss": 0.245, "step": 1787 }, { "epoch": 15.282051282051283, "grad_norm": 9.717825889587402, "learning_rate": 2.358974358974359e-06, "loss": 0.5324, "step": 1788 }, { "epoch": 15.290598290598291, "grad_norm": 2.209528923034668, "learning_rate": 2.354700854700855e-06, "loss": 0.0854, "step": 1789 }, { "epoch": 15.2991452991453, "grad_norm": 4.554971218109131, "learning_rate": 2.3504273504273504e-06, "loss": 0.1271, "step": 1790 }, { "epoch": 15.307692307692308, "grad_norm": 3.1280457973480225, "learning_rate": 2.3461538461538463e-06, "loss": 0.1265, "step": 1791 }, { "epoch": 15.316239316239317, "grad_norm": 2.647224187850952, "learning_rate": 2.3418803418803422e-06, "loss": 0.1965, "step": 1792 }, { "epoch": 15.324786324786325, "grad_norm": 2.7695155143737793, "learning_rate": 2.3376068376068377e-06, "loss": 0.0528, "step": 1793 }, { "epoch": 15.333333333333334, "grad_norm": 20.151025772094727, "learning_rate": 2.3333333333333336e-06, "loss": 0.2011, "step": 1794 }, { "epoch": 15.341880341880342, "grad_norm": 2.8718080520629883, "learning_rate": 2.3290598290598295e-06, "loss": 0.0502, "step": 1795 }, { "epoch": 15.350427350427351, "grad_norm": 2.17462158203125, "learning_rate": 2.324786324786325e-06, "loss": 0.0658, "step": 1796 }, { "epoch": 15.35897435897436, "grad_norm": 4.324810981750488, "learning_rate": 2.320512820512821e-06, "loss": 0.1429, "step": 1797 }, { "epoch": 15.367521367521368, "grad_norm": 184.52798461914062, "learning_rate": 2.3162393162393167e-06, "loss": 0.5155, "step": 1798 }, { "epoch": 15.376068376068377, "grad_norm": 2.6076488494873047, "learning_rate": 2.311965811965812e-06, "loss": 0.0708, "step": 1799 }, { "epoch": 15.384615384615385, "grad_norm": 3.0682790279388428, "learning_rate": 2.307692307692308e-06, "loss": 0.2662, "step": 1800 }, { "epoch": 15.393162393162394, "grad_norm": 1.3366855382919312, "learning_rate": 2.3034188034188035e-06, "loss": 0.0136, "step": 1801 }, { "epoch": 15.401709401709402, "grad_norm": 0.5489670634269714, "learning_rate": 2.299145299145299e-06, "loss": 0.0148, "step": 1802 }, { "epoch": 15.41025641025641, "grad_norm": 1.080804705619812, "learning_rate": 2.294871794871795e-06, "loss": 0.025, "step": 1803 }, { "epoch": 15.418803418803419, "grad_norm": 8.801629066467285, "learning_rate": 2.290598290598291e-06, "loss": 0.2038, "step": 1804 }, { "epoch": 15.427350427350428, "grad_norm": 66.96419525146484, "learning_rate": 2.2863247863247863e-06, "loss": 0.4094, "step": 1805 }, { "epoch": 15.435897435897436, "grad_norm": 1.3400782346725464, "learning_rate": 2.282051282051282e-06, "loss": 0.0452, "step": 1806 }, { "epoch": 15.444444444444445, "grad_norm": 3.5850300788879395, "learning_rate": 2.277777777777778e-06, "loss": 0.0919, "step": 1807 }, { "epoch": 15.452991452991453, "grad_norm": 8.670539855957031, "learning_rate": 2.2735042735042735e-06, "loss": 0.255, "step": 1808 }, { "epoch": 15.461538461538462, "grad_norm": 3.609617233276367, "learning_rate": 2.2692307692307694e-06, "loss": 0.1203, "step": 1809 }, { "epoch": 15.47008547008547, "grad_norm": 1.5857924222946167, "learning_rate": 2.2649572649572653e-06, "loss": 0.0371, "step": 1810 }, { "epoch": 15.478632478632479, "grad_norm": 1.386805534362793, "learning_rate": 2.2606837606837608e-06, "loss": 0.0385, "step": 1811 }, { "epoch": 15.487179487179487, "grad_norm": 4.130802631378174, "learning_rate": 2.2564102564102566e-06, "loss": 0.2261, "step": 1812 }, { "epoch": 15.495726495726496, "grad_norm": 2.974247455596924, "learning_rate": 2.2521367521367525e-06, "loss": 0.0651, "step": 1813 }, { "epoch": 15.504273504273504, "grad_norm": 1.2551554441452026, "learning_rate": 2.247863247863248e-06, "loss": 0.0229, "step": 1814 }, { "epoch": 15.512820512820513, "grad_norm": 3.1401453018188477, "learning_rate": 2.243589743589744e-06, "loss": 0.0409, "step": 1815 }, { "epoch": 15.521367521367521, "grad_norm": 1.3921948671340942, "learning_rate": 2.2393162393162398e-06, "loss": 0.0335, "step": 1816 }, { "epoch": 15.52991452991453, "grad_norm": 5.457981586456299, "learning_rate": 2.2350427350427353e-06, "loss": 0.22, "step": 1817 }, { "epoch": 15.538461538461538, "grad_norm": 0.9100427031517029, "learning_rate": 2.230769230769231e-06, "loss": 0.0217, "step": 1818 }, { "epoch": 15.547008547008547, "grad_norm": 3.5890519618988037, "learning_rate": 2.2264957264957266e-06, "loss": 0.2241, "step": 1819 }, { "epoch": 15.555555555555555, "grad_norm": 2.965954303741455, "learning_rate": 2.222222222222222e-06, "loss": 0.1453, "step": 1820 }, { "epoch": 15.564102564102564, "grad_norm": 8.436135292053223, "learning_rate": 2.217948717948718e-06, "loss": 0.2784, "step": 1821 }, { "epoch": 15.572649572649572, "grad_norm": 2.043687582015991, "learning_rate": 2.213675213675214e-06, "loss": 0.0755, "step": 1822 }, { "epoch": 15.581196581196581, "grad_norm": 2.380276918411255, "learning_rate": 2.2094017094017093e-06, "loss": 0.1867, "step": 1823 }, { "epoch": 15.58974358974359, "grad_norm": 2.5189390182495117, "learning_rate": 2.2051282051282052e-06, "loss": 0.0619, "step": 1824 }, { "epoch": 15.598290598290598, "grad_norm": 1.123610258102417, "learning_rate": 2.200854700854701e-06, "loss": 0.0286, "step": 1825 }, { "epoch": 15.606837606837606, "grad_norm": 3.0018534660339355, "learning_rate": 2.1965811965811966e-06, "loss": 0.1449, "step": 1826 }, { "epoch": 15.615384615384615, "grad_norm": 2.178926706314087, "learning_rate": 2.1923076923076925e-06, "loss": 0.0859, "step": 1827 }, { "epoch": 15.623931623931623, "grad_norm": 5.799438953399658, "learning_rate": 2.1880341880341884e-06, "loss": 0.2669, "step": 1828 }, { "epoch": 15.632478632478632, "grad_norm": 2.0338144302368164, "learning_rate": 2.183760683760684e-06, "loss": 0.0616, "step": 1829 }, { "epoch": 15.64102564102564, "grad_norm": 3.789525032043457, "learning_rate": 2.1794871794871797e-06, "loss": 0.0439, "step": 1830 }, { "epoch": 15.649572649572649, "grad_norm": 2.3695919513702393, "learning_rate": 2.1752136752136756e-06, "loss": 0.0979, "step": 1831 }, { "epoch": 15.658119658119658, "grad_norm": 0.8543546795845032, "learning_rate": 2.170940170940171e-06, "loss": 0.0171, "step": 1832 }, { "epoch": 15.666666666666666, "grad_norm": 3.7921054363250732, "learning_rate": 2.166666666666667e-06, "loss": 0.1094, "step": 1833 }, { "epoch": 15.675213675213675, "grad_norm": 1.9967904090881348, "learning_rate": 2.162393162393163e-06, "loss": 0.0382, "step": 1834 }, { "epoch": 15.683760683760683, "grad_norm": 2.5073959827423096, "learning_rate": 2.1581196581196583e-06, "loss": 0.0554, "step": 1835 }, { "epoch": 15.692307692307692, "grad_norm": 1.2741888761520386, "learning_rate": 2.153846153846154e-06, "loss": 0.056, "step": 1836 }, { "epoch": 15.7008547008547, "grad_norm": 1.992280125617981, "learning_rate": 2.1495726495726497e-06, "loss": 0.0206, "step": 1837 }, { "epoch": 15.709401709401709, "grad_norm": 1.0176990032196045, "learning_rate": 2.145299145299145e-06, "loss": 0.0276, "step": 1838 }, { "epoch": 15.717948717948717, "grad_norm": 1.6685941219329834, "learning_rate": 2.141025641025641e-06, "loss": 0.0222, "step": 1839 }, { "epoch": 15.726495726495726, "grad_norm": 3.171050548553467, "learning_rate": 2.136752136752137e-06, "loss": 0.1526, "step": 1840 }, { "epoch": 15.735042735042736, "grad_norm": 1.5068336725234985, "learning_rate": 2.1324786324786324e-06, "loss": 0.0271, "step": 1841 }, { "epoch": 15.743589743589745, "grad_norm": 3.171870708465576, "learning_rate": 2.1282051282051283e-06, "loss": 0.0628, "step": 1842 }, { "epoch": 15.752136752136753, "grad_norm": 1.9212791919708252, "learning_rate": 2.123931623931624e-06, "loss": 0.1018, "step": 1843 }, { "epoch": 15.760683760683762, "grad_norm": 4.073456287384033, "learning_rate": 2.1196581196581196e-06, "loss": 0.1144, "step": 1844 }, { "epoch": 15.76923076923077, "grad_norm": 1.8453985452651978, "learning_rate": 2.1153846153846155e-06, "loss": 0.0995, "step": 1845 }, { "epoch": 15.777777777777779, "grad_norm": 3.285759210586548, "learning_rate": 2.1111111111111114e-06, "loss": 0.1173, "step": 1846 }, { "epoch": 15.786324786324787, "grad_norm": 3.709202289581299, "learning_rate": 2.106837606837607e-06, "loss": 0.1906, "step": 1847 }, { "epoch": 15.794871794871796, "grad_norm": 1.951262354850769, "learning_rate": 2.1025641025641028e-06, "loss": 0.0954, "step": 1848 }, { "epoch": 15.803418803418804, "grad_norm": 3.249171257019043, "learning_rate": 2.0982905982905987e-06, "loss": 0.1258, "step": 1849 }, { "epoch": 15.811965811965813, "grad_norm": 0.5708752274513245, "learning_rate": 2.094017094017094e-06, "loss": 0.0128, "step": 1850 }, { "epoch": 15.820512820512821, "grad_norm": 3.2894484996795654, "learning_rate": 2.08974358974359e-06, "loss": 0.0621, "step": 1851 }, { "epoch": 15.82905982905983, "grad_norm": 0.8564540147781372, "learning_rate": 2.085470085470086e-06, "loss": 0.0194, "step": 1852 }, { "epoch": 15.837606837606838, "grad_norm": 3.319011926651001, "learning_rate": 2.0811965811965814e-06, "loss": 0.1413, "step": 1853 }, { "epoch": 15.846153846153847, "grad_norm": 1.5385066270828247, "learning_rate": 2.0769230769230773e-06, "loss": 0.0316, "step": 1854 }, { "epoch": 15.854700854700855, "grad_norm": 4.076297283172607, "learning_rate": 2.072649572649573e-06, "loss": 0.2257, "step": 1855 }, { "epoch": 15.863247863247864, "grad_norm": 4.738671779632568, "learning_rate": 2.068376068376068e-06, "loss": 0.1627, "step": 1856 }, { "epoch": 15.871794871794872, "grad_norm": 5.589550495147705, "learning_rate": 2.064102564102564e-06, "loss": 0.3182, "step": 1857 }, { "epoch": 15.88034188034188, "grad_norm": 1.6303757429122925, "learning_rate": 2.05982905982906e-06, "loss": 0.0384, "step": 1858 }, { "epoch": 15.88888888888889, "grad_norm": 3.0257458686828613, "learning_rate": 2.0555555555555555e-06, "loss": 0.0967, "step": 1859 }, { "epoch": 15.897435897435898, "grad_norm": 2.4926559925079346, "learning_rate": 2.0512820512820513e-06, "loss": 0.0703, "step": 1860 }, { "epoch": 15.905982905982906, "grad_norm": 2.0784358978271484, "learning_rate": 2.0470085470085472e-06, "loss": 0.062, "step": 1861 }, { "epoch": 15.914529914529915, "grad_norm": 4.92131233215332, "learning_rate": 2.0427350427350427e-06, "loss": 0.0875, "step": 1862 }, { "epoch": 15.923076923076923, "grad_norm": 2.999511241912842, "learning_rate": 2.0384615384615386e-06, "loss": 0.0388, "step": 1863 }, { "epoch": 15.931623931623932, "grad_norm": 5.770095348358154, "learning_rate": 2.0341880341880345e-06, "loss": 0.1257, "step": 1864 }, { "epoch": 15.94017094017094, "grad_norm": 4.730950832366943, "learning_rate": 2.02991452991453e-06, "loss": 0.2386, "step": 1865 }, { "epoch": 15.948717948717949, "grad_norm": 1.8125661611557007, "learning_rate": 2.025641025641026e-06, "loss": 0.0433, "step": 1866 }, { "epoch": 15.957264957264957, "grad_norm": 5.433501243591309, "learning_rate": 2.0213675213675217e-06, "loss": 0.0536, "step": 1867 }, { "epoch": 15.965811965811966, "grad_norm": 1.2565219402313232, "learning_rate": 2.017094017094017e-06, "loss": 0.0263, "step": 1868 }, { "epoch": 15.974358974358974, "grad_norm": 1.5660192966461182, "learning_rate": 2.012820512820513e-06, "loss": 0.0387, "step": 1869 }, { "epoch": 15.982905982905983, "grad_norm": 5.742929935455322, "learning_rate": 2.008547008547009e-06, "loss": 0.2158, "step": 1870 }, { "epoch": 15.991452991452991, "grad_norm": 3.597506284713745, "learning_rate": 2.0042735042735044e-06, "loss": 0.0962, "step": 1871 }, { "epoch": 16.0, "grad_norm": 1.753219485282898, "learning_rate": 2.0000000000000003e-06, "loss": 0.0193, "step": 1872 }, { "epoch": 16.0, "eval_loss": 0.05589358136057854, "eval_runtime": 9.2203, "eval_samples_per_second": 50.541, "eval_steps_per_second": 6.399, "step": 1872 }, { "epoch": 16.00854700854701, "grad_norm": 9.627431869506836, "learning_rate": 1.9957264957264962e-06, "loss": 0.4748, "step": 1873 }, { "epoch": 16.017094017094017, "grad_norm": 7.770556926727295, "learning_rate": 1.9914529914529917e-06, "loss": 0.2615, "step": 1874 }, { "epoch": 16.025641025641026, "grad_norm": 1.7268822193145752, "learning_rate": 1.987179487179487e-06, "loss": 0.0808, "step": 1875 }, { "epoch": 16.034188034188034, "grad_norm": 1.7209370136260986, "learning_rate": 1.982905982905983e-06, "loss": 0.0575, "step": 1876 }, { "epoch": 16.042735042735043, "grad_norm": 2.6422786712646484, "learning_rate": 1.9786324786324785e-06, "loss": 0.0815, "step": 1877 }, { "epoch": 16.05128205128205, "grad_norm": 0.9057373404502869, "learning_rate": 1.9743589743589744e-06, "loss": 0.0359, "step": 1878 }, { "epoch": 16.05982905982906, "grad_norm": 1.4879076480865479, "learning_rate": 1.9700854700854703e-06, "loss": 0.0658, "step": 1879 }, { "epoch": 16.068376068376068, "grad_norm": 2.1336488723754883, "learning_rate": 1.9658119658119658e-06, "loss": 0.0434, "step": 1880 }, { "epoch": 16.076923076923077, "grad_norm": 2.642249822616577, "learning_rate": 1.9615384615384617e-06, "loss": 0.0768, "step": 1881 }, { "epoch": 16.085470085470085, "grad_norm": 398.1800842285156, "learning_rate": 1.9572649572649575e-06, "loss": 1.7061, "step": 1882 }, { "epoch": 16.094017094017094, "grad_norm": 1.6067556142807007, "learning_rate": 1.952991452991453e-06, "loss": 0.0492, "step": 1883 }, { "epoch": 16.102564102564102, "grad_norm": 45.67499542236328, "learning_rate": 1.948717948717949e-06, "loss": 0.2883, "step": 1884 }, { "epoch": 16.11111111111111, "grad_norm": 5.477624416351318, "learning_rate": 1.944444444444445e-06, "loss": 0.1107, "step": 1885 }, { "epoch": 16.11965811965812, "grad_norm": 2.2795376777648926, "learning_rate": 1.9401709401709403e-06, "loss": 0.0427, "step": 1886 }, { "epoch": 16.128205128205128, "grad_norm": 1.9572805166244507, "learning_rate": 1.935897435897436e-06, "loss": 0.04, "step": 1887 }, { "epoch": 16.136752136752136, "grad_norm": 1.9205402135849, "learning_rate": 1.931623931623932e-06, "loss": 0.0384, "step": 1888 }, { "epoch": 16.145299145299145, "grad_norm": 1.6124738454818726, "learning_rate": 1.9273504273504275e-06, "loss": 0.0322, "step": 1889 }, { "epoch": 16.153846153846153, "grad_norm": 3.3396270275115967, "learning_rate": 1.9230769230769234e-06, "loss": 0.1302, "step": 1890 }, { "epoch": 16.162393162393162, "grad_norm": 2.4800124168395996, "learning_rate": 1.9188034188034193e-06, "loss": 0.1181, "step": 1891 }, { "epoch": 16.17094017094017, "grad_norm": 5.452153205871582, "learning_rate": 1.9145299145299148e-06, "loss": 0.2054, "step": 1892 }, { "epoch": 16.17948717948718, "grad_norm": 4.445066452026367, "learning_rate": 1.9102564102564102e-06, "loss": 0.1649, "step": 1893 }, { "epoch": 16.188034188034187, "grad_norm": 1.0402263402938843, "learning_rate": 1.9059829059829061e-06, "loss": 0.0285, "step": 1894 }, { "epoch": 16.196581196581196, "grad_norm": 1.8124594688415527, "learning_rate": 1.9017094017094018e-06, "loss": 0.0717, "step": 1895 }, { "epoch": 16.205128205128204, "grad_norm": 5.0620245933532715, "learning_rate": 1.8974358974358975e-06, "loss": 0.3833, "step": 1896 }, { "epoch": 16.213675213675213, "grad_norm": 3.201596975326538, "learning_rate": 1.8931623931623931e-06, "loss": 0.0687, "step": 1897 }, { "epoch": 16.22222222222222, "grad_norm": 0.9610732793807983, "learning_rate": 1.888888888888889e-06, "loss": 0.0165, "step": 1898 }, { "epoch": 16.23076923076923, "grad_norm": 1.3409554958343506, "learning_rate": 1.8846153846153847e-06, "loss": 0.024, "step": 1899 }, { "epoch": 16.23931623931624, "grad_norm": 1.2862681150436401, "learning_rate": 1.8803418803418804e-06, "loss": 0.042, "step": 1900 }, { "epoch": 16.247863247863247, "grad_norm": 6.403625011444092, "learning_rate": 1.8760683760683763e-06, "loss": 0.5536, "step": 1901 }, { "epoch": 16.256410256410255, "grad_norm": 3.241731882095337, "learning_rate": 1.871794871794872e-06, "loss": 0.1045, "step": 1902 }, { "epoch": 16.264957264957264, "grad_norm": 1.1206634044647217, "learning_rate": 1.8675213675213676e-06, "loss": 0.0383, "step": 1903 }, { "epoch": 16.273504273504273, "grad_norm": 3.3005762100219727, "learning_rate": 1.8632478632478635e-06, "loss": 0.0786, "step": 1904 }, { "epoch": 16.28205128205128, "grad_norm": 0.44867634773254395, "learning_rate": 1.8589743589743592e-06, "loss": 0.0104, "step": 1905 }, { "epoch": 16.29059829059829, "grad_norm": 2.7023422718048096, "learning_rate": 1.8547008547008549e-06, "loss": 0.1091, "step": 1906 }, { "epoch": 16.299145299145298, "grad_norm": 0.9612734317779541, "learning_rate": 1.8504273504273506e-06, "loss": 0.0165, "step": 1907 }, { "epoch": 16.307692307692307, "grad_norm": 3.0632894039154053, "learning_rate": 1.8461538461538465e-06, "loss": 0.1118, "step": 1908 }, { "epoch": 16.316239316239315, "grad_norm": 3.932769775390625, "learning_rate": 1.8418803418803421e-06, "loss": 0.1084, "step": 1909 }, { "epoch": 16.324786324786324, "grad_norm": 7.795356273651123, "learning_rate": 1.8376068376068378e-06, "loss": 0.2923, "step": 1910 }, { "epoch": 16.333333333333332, "grad_norm": 1.4187766313552856, "learning_rate": 1.8333333333333333e-06, "loss": 0.0408, "step": 1911 }, { "epoch": 16.34188034188034, "grad_norm": 1.1020699739456177, "learning_rate": 1.8290598290598292e-06, "loss": 0.0168, "step": 1912 }, { "epoch": 16.35042735042735, "grad_norm": 0.9890375733375549, "learning_rate": 1.8247863247863249e-06, "loss": 0.0391, "step": 1913 }, { "epoch": 16.358974358974358, "grad_norm": 39.418235778808594, "learning_rate": 1.8205128205128205e-06, "loss": 0.2804, "step": 1914 }, { "epoch": 16.367521367521366, "grad_norm": 1.6613589525222778, "learning_rate": 1.8162393162393164e-06, "loss": 0.0475, "step": 1915 }, { "epoch": 16.376068376068375, "grad_norm": 4.359612464904785, "learning_rate": 1.811965811965812e-06, "loss": 0.2247, "step": 1916 }, { "epoch": 16.384615384615383, "grad_norm": 1.970078706741333, "learning_rate": 1.8076923076923078e-06, "loss": 0.03, "step": 1917 }, { "epoch": 16.39316239316239, "grad_norm": 2.046025037765503, "learning_rate": 1.8034188034188035e-06, "loss": 0.0277, "step": 1918 }, { "epoch": 16.4017094017094, "grad_norm": 1.5775028467178345, "learning_rate": 1.7991452991452994e-06, "loss": 0.0764, "step": 1919 }, { "epoch": 16.41025641025641, "grad_norm": 2.8837273120880127, "learning_rate": 1.794871794871795e-06, "loss": 0.0903, "step": 1920 }, { "epoch": 16.418803418803417, "grad_norm": 7.059972763061523, "learning_rate": 1.7905982905982907e-06, "loss": 0.0679, "step": 1921 }, { "epoch": 16.427350427350426, "grad_norm": 3.6101839542388916, "learning_rate": 1.7863247863247866e-06, "loss": 0.1402, "step": 1922 }, { "epoch": 16.435897435897434, "grad_norm": 2.3459484577178955, "learning_rate": 1.7820512820512823e-06, "loss": 0.0751, "step": 1923 }, { "epoch": 16.444444444444443, "grad_norm": 2.0556280612945557, "learning_rate": 1.777777777777778e-06, "loss": 0.0452, "step": 1924 }, { "epoch": 16.45299145299145, "grad_norm": 0.5339368581771851, "learning_rate": 1.7735042735042736e-06, "loss": 0.013, "step": 1925 }, { "epoch": 16.46153846153846, "grad_norm": 1.393329381942749, "learning_rate": 1.7692307692307695e-06, "loss": 0.038, "step": 1926 }, { "epoch": 16.47008547008547, "grad_norm": 0.9439583420753479, "learning_rate": 1.7649572649572652e-06, "loss": 0.0228, "step": 1927 }, { "epoch": 16.478632478632477, "grad_norm": 3.437713384628296, "learning_rate": 1.7606837606837609e-06, "loss": 0.2072, "step": 1928 }, { "epoch": 16.487179487179485, "grad_norm": 1.725557804107666, "learning_rate": 1.7564102564102563e-06, "loss": 0.0494, "step": 1929 }, { "epoch": 16.495726495726494, "grad_norm": 2.4226529598236084, "learning_rate": 1.7521367521367522e-06, "loss": 0.0796, "step": 1930 }, { "epoch": 16.504273504273506, "grad_norm": 36.0551643371582, "learning_rate": 1.747863247863248e-06, "loss": 0.1966, "step": 1931 }, { "epoch": 16.51282051282051, "grad_norm": 0.8370515704154968, "learning_rate": 1.7435897435897436e-06, "loss": 0.0346, "step": 1932 }, { "epoch": 16.521367521367523, "grad_norm": 2.486854314804077, "learning_rate": 1.7393162393162395e-06, "loss": 0.1423, "step": 1933 }, { "epoch": 16.52991452991453, "grad_norm": 3.2457993030548096, "learning_rate": 1.7350427350427352e-06, "loss": 0.1894, "step": 1934 }, { "epoch": 16.53846153846154, "grad_norm": 2.1744906902313232, "learning_rate": 1.7307692307692308e-06, "loss": 0.0889, "step": 1935 }, { "epoch": 16.54700854700855, "grad_norm": 1.9443250894546509, "learning_rate": 1.7264957264957265e-06, "loss": 0.0413, "step": 1936 }, { "epoch": 16.555555555555557, "grad_norm": 2.0389249324798584, "learning_rate": 1.7222222222222224e-06, "loss": 0.0798, "step": 1937 }, { "epoch": 16.564102564102566, "grad_norm": 4.600223064422607, "learning_rate": 1.717948717948718e-06, "loss": 0.0706, "step": 1938 }, { "epoch": 16.572649572649574, "grad_norm": 1.4231921434402466, "learning_rate": 1.7136752136752138e-06, "loss": 0.0856, "step": 1939 }, { "epoch": 16.581196581196583, "grad_norm": 4.8655290603637695, "learning_rate": 1.7094017094017097e-06, "loss": 0.2519, "step": 1940 }, { "epoch": 16.58974358974359, "grad_norm": 2.6834962368011475, "learning_rate": 1.7051282051282053e-06, "loss": 0.0328, "step": 1941 }, { "epoch": 16.5982905982906, "grad_norm": 0.625557541847229, "learning_rate": 1.700854700854701e-06, "loss": 0.0129, "step": 1942 }, { "epoch": 16.60683760683761, "grad_norm": 10.57834243774414, "learning_rate": 1.6965811965811967e-06, "loss": 0.2987, "step": 1943 }, { "epoch": 16.615384615384617, "grad_norm": 1.2357791662216187, "learning_rate": 1.6923076923076926e-06, "loss": 0.0294, "step": 1944 }, { "epoch": 16.623931623931625, "grad_norm": 1.8380581140518188, "learning_rate": 1.6880341880341883e-06, "loss": 0.0298, "step": 1945 }, { "epoch": 16.632478632478634, "grad_norm": 1.2370020151138306, "learning_rate": 1.683760683760684e-06, "loss": 0.0285, "step": 1946 }, { "epoch": 16.641025641025642, "grad_norm": 5.922267913818359, "learning_rate": 1.6794871794871794e-06, "loss": 0.24, "step": 1947 }, { "epoch": 16.64957264957265, "grad_norm": 2.439023494720459, "learning_rate": 1.6752136752136753e-06, "loss": 0.0988, "step": 1948 }, { "epoch": 16.65811965811966, "grad_norm": 0.8908723592758179, "learning_rate": 1.670940170940171e-06, "loss": 0.026, "step": 1949 }, { "epoch": 16.666666666666668, "grad_norm": 0.8728394508361816, "learning_rate": 1.6666666666666667e-06, "loss": 0.018, "step": 1950 }, { "epoch": 16.675213675213676, "grad_norm": 2.7304019927978516, "learning_rate": 1.6623931623931626e-06, "loss": 0.1567, "step": 1951 }, { "epoch": 16.683760683760685, "grad_norm": 2.8601150512695312, "learning_rate": 1.6581196581196582e-06, "loss": 0.0721, "step": 1952 }, { "epoch": 16.692307692307693, "grad_norm": 2.5990025997161865, "learning_rate": 1.653846153846154e-06, "loss": 0.2296, "step": 1953 }, { "epoch": 16.700854700854702, "grad_norm": 3.7956109046936035, "learning_rate": 1.6495726495726496e-06, "loss": 0.2565, "step": 1954 }, { "epoch": 16.70940170940171, "grad_norm": 5.933072566986084, "learning_rate": 1.6452991452991455e-06, "loss": 0.2712, "step": 1955 }, { "epoch": 16.71794871794872, "grad_norm": 0.5651862621307373, "learning_rate": 1.6410256410256412e-06, "loss": 0.0132, "step": 1956 }, { "epoch": 16.726495726495727, "grad_norm": 3.033231735229492, "learning_rate": 1.6367521367521368e-06, "loss": 0.074, "step": 1957 }, { "epoch": 16.735042735042736, "grad_norm": 1.3515870571136475, "learning_rate": 1.6324786324786327e-06, "loss": 0.0614, "step": 1958 }, { "epoch": 16.743589743589745, "grad_norm": 3.091700792312622, "learning_rate": 1.6282051282051284e-06, "loss": 0.1284, "step": 1959 }, { "epoch": 16.752136752136753, "grad_norm": 7.142216205596924, "learning_rate": 1.623931623931624e-06, "loss": 0.1965, "step": 1960 }, { "epoch": 16.76068376068376, "grad_norm": 7.488593578338623, "learning_rate": 1.6196581196581198e-06, "loss": 0.2498, "step": 1961 }, { "epoch": 16.76923076923077, "grad_norm": 3.943833351135254, "learning_rate": 1.6153846153846157e-06, "loss": 0.0967, "step": 1962 }, { "epoch": 16.77777777777778, "grad_norm": 1.8732318878173828, "learning_rate": 1.6111111111111113e-06, "loss": 0.029, "step": 1963 }, { "epoch": 16.786324786324787, "grad_norm": 2.5445902347564697, "learning_rate": 1.606837606837607e-06, "loss": 0.0808, "step": 1964 }, { "epoch": 16.794871794871796, "grad_norm": 4.969367504119873, "learning_rate": 1.602564102564103e-06, "loss": 0.164, "step": 1965 }, { "epoch": 16.803418803418804, "grad_norm": 1.6954468488693237, "learning_rate": 1.5982905982905984e-06, "loss": 0.0645, "step": 1966 }, { "epoch": 16.811965811965813, "grad_norm": 1.536352276802063, "learning_rate": 1.594017094017094e-06, "loss": 0.0595, "step": 1967 }, { "epoch": 16.82051282051282, "grad_norm": 0.7326592803001404, "learning_rate": 1.5897435897435897e-06, "loss": 0.0153, "step": 1968 }, { "epoch": 16.82905982905983, "grad_norm": 10.959025382995605, "learning_rate": 1.5854700854700856e-06, "loss": 0.3274, "step": 1969 }, { "epoch": 16.837606837606838, "grad_norm": 10.305845260620117, "learning_rate": 1.5811965811965813e-06, "loss": 0.1404, "step": 1970 }, { "epoch": 16.846153846153847, "grad_norm": 7.498697280883789, "learning_rate": 1.576923076923077e-06, "loss": 0.2269, "step": 1971 }, { "epoch": 16.854700854700855, "grad_norm": 0.29253125190734863, "learning_rate": 1.5726495726495727e-06, "loss": 0.0074, "step": 1972 }, { "epoch": 16.863247863247864, "grad_norm": 9.320234298706055, "learning_rate": 1.5683760683760685e-06, "loss": 0.067, "step": 1973 }, { "epoch": 16.871794871794872, "grad_norm": 6.572272300720215, "learning_rate": 1.5641025641025642e-06, "loss": 0.4577, "step": 1974 }, { "epoch": 16.88034188034188, "grad_norm": 5.368937969207764, "learning_rate": 1.55982905982906e-06, "loss": 0.2016, "step": 1975 }, { "epoch": 16.88888888888889, "grad_norm": 0.5891698598861694, "learning_rate": 1.5555555555555558e-06, "loss": 0.0174, "step": 1976 }, { "epoch": 16.897435897435898, "grad_norm": 3.045989751815796, "learning_rate": 1.5512820512820515e-06, "loss": 0.1748, "step": 1977 }, { "epoch": 16.905982905982906, "grad_norm": 3.013834238052368, "learning_rate": 1.5470085470085471e-06, "loss": 0.2283, "step": 1978 }, { "epoch": 16.914529914529915, "grad_norm": 1.2644447088241577, "learning_rate": 1.5427350427350428e-06, "loss": 0.0302, "step": 1979 }, { "epoch": 16.923076923076923, "grad_norm": 4.429958820343018, "learning_rate": 1.5384615384615387e-06, "loss": 0.2458, "step": 1980 }, { "epoch": 16.931623931623932, "grad_norm": 1.1556981801986694, "learning_rate": 1.5341880341880344e-06, "loss": 0.0179, "step": 1981 }, { "epoch": 16.94017094017094, "grad_norm": 1.4588316679000854, "learning_rate": 1.52991452991453e-06, "loss": 0.1063, "step": 1982 }, { "epoch": 16.94871794871795, "grad_norm": 1.124496340751648, "learning_rate": 1.525641025641026e-06, "loss": 0.0278, "step": 1983 }, { "epoch": 16.957264957264957, "grad_norm": 0.7231981754302979, "learning_rate": 1.5213675213675214e-06, "loss": 0.0141, "step": 1984 }, { "epoch": 16.965811965811966, "grad_norm": 1.4819642305374146, "learning_rate": 1.5170940170940171e-06, "loss": 0.0601, "step": 1985 }, { "epoch": 16.974358974358974, "grad_norm": 0.7296791672706604, "learning_rate": 1.5128205128205128e-06, "loss": 0.0215, "step": 1986 }, { "epoch": 16.982905982905983, "grad_norm": 15.651564598083496, "learning_rate": 1.5085470085470087e-06, "loss": 0.2954, "step": 1987 }, { "epoch": 16.99145299145299, "grad_norm": 0.48891735076904297, "learning_rate": 1.5042735042735044e-06, "loss": 0.015, "step": 1988 }, { "epoch": 17.0, "grad_norm": 7.363093376159668, "learning_rate": 1.5e-06, "loss": 0.2366, "step": 1989 }, { "epoch": 17.0, "eval_loss": 0.05406723916530609, "eval_runtime": 9.389, "eval_samples_per_second": 49.633, "eval_steps_per_second": 6.284, "step": 1989 }, { "epoch": 17.00854700854701, "grad_norm": 2.8626017570495605, "learning_rate": 1.4957264957264957e-06, "loss": 0.0902, "step": 1990 }, { "epoch": 17.017094017094017, "grad_norm": 2.461879253387451, "learning_rate": 1.4914529914529916e-06, "loss": 0.0387, "step": 1991 }, { "epoch": 17.025641025641026, "grad_norm": 6.336863994598389, "learning_rate": 1.4871794871794873e-06, "loss": 0.196, "step": 1992 }, { "epoch": 17.034188034188034, "grad_norm": 1.1044467687606812, "learning_rate": 1.482905982905983e-06, "loss": 0.0352, "step": 1993 }, { "epoch": 17.042735042735043, "grad_norm": 3.3509342670440674, "learning_rate": 1.4786324786324789e-06, "loss": 0.1459, "step": 1994 }, { "epoch": 17.05128205128205, "grad_norm": 3.2349629402160645, "learning_rate": 1.4743589743589745e-06, "loss": 0.0179, "step": 1995 }, { "epoch": 17.05982905982906, "grad_norm": 3.650749921798706, "learning_rate": 1.4700854700854702e-06, "loss": 0.1549, "step": 1996 }, { "epoch": 17.068376068376068, "grad_norm": 1.6349891424179077, "learning_rate": 1.465811965811966e-06, "loss": 0.0713, "step": 1997 }, { "epoch": 17.076923076923077, "grad_norm": 8.602070808410645, "learning_rate": 1.4615384615384618e-06, "loss": 0.3582, "step": 1998 }, { "epoch": 17.085470085470085, "grad_norm": 3.1162590980529785, "learning_rate": 1.4572649572649575e-06, "loss": 0.2455, "step": 1999 }, { "epoch": 17.094017094017094, "grad_norm": 1.4878407716751099, "learning_rate": 1.4529914529914531e-06, "loss": 0.0195, "step": 2000 }, { "epoch": 17.102564102564102, "grad_norm": 2.565297842025757, "learning_rate": 1.448717948717949e-06, "loss": 0.1126, "step": 2001 }, { "epoch": 17.11111111111111, "grad_norm": 4.169450759887695, "learning_rate": 1.4444444444444445e-06, "loss": 0.1774, "step": 2002 }, { "epoch": 17.11965811965812, "grad_norm": 1.8476792573928833, "learning_rate": 1.4401709401709402e-06, "loss": 0.0288, "step": 2003 }, { "epoch": 17.128205128205128, "grad_norm": 0.7279506921768188, "learning_rate": 1.4358974358974359e-06, "loss": 0.0217, "step": 2004 }, { "epoch": 17.136752136752136, "grad_norm": 7.387227535247803, "learning_rate": 1.4316239316239317e-06, "loss": 0.248, "step": 2005 }, { "epoch": 17.145299145299145, "grad_norm": 2.9455361366271973, "learning_rate": 1.4273504273504274e-06, "loss": 0.0439, "step": 2006 }, { "epoch": 17.153846153846153, "grad_norm": 6.015694618225098, "learning_rate": 1.423076923076923e-06, "loss": 0.0656, "step": 2007 }, { "epoch": 17.162393162393162, "grad_norm": 1.741774320602417, "learning_rate": 1.4188034188034188e-06, "loss": 0.0344, "step": 2008 }, { "epoch": 17.17094017094017, "grad_norm": 0.5282659530639648, "learning_rate": 1.4145299145299147e-06, "loss": 0.0128, "step": 2009 }, { "epoch": 17.17948717948718, "grad_norm": 2.4927468299865723, "learning_rate": 1.4102564102564104e-06, "loss": 0.1839, "step": 2010 }, { "epoch": 17.188034188034187, "grad_norm": 0.7872166037559509, "learning_rate": 1.405982905982906e-06, "loss": 0.0204, "step": 2011 }, { "epoch": 17.196581196581196, "grad_norm": 0.7072253227233887, "learning_rate": 1.401709401709402e-06, "loss": 0.0206, "step": 2012 }, { "epoch": 17.205128205128204, "grad_norm": 1.0154236555099487, "learning_rate": 1.3974358974358976e-06, "loss": 0.0238, "step": 2013 }, { "epoch": 17.213675213675213, "grad_norm": 2.9798424243927, "learning_rate": 1.3931623931623933e-06, "loss": 0.0542, "step": 2014 }, { "epoch": 17.22222222222222, "grad_norm": 0.9568426012992859, "learning_rate": 1.3888888888888892e-06, "loss": 0.0239, "step": 2015 }, { "epoch": 17.23076923076923, "grad_norm": 10.525039672851562, "learning_rate": 1.3846153846153848e-06, "loss": 0.1768, "step": 2016 }, { "epoch": 17.23931623931624, "grad_norm": 1.697314977645874, "learning_rate": 1.3803418803418805e-06, "loss": 0.0453, "step": 2017 }, { "epoch": 17.247863247863247, "grad_norm": 0.6436419486999512, "learning_rate": 1.3760683760683762e-06, "loss": 0.0163, "step": 2018 }, { "epoch": 17.256410256410255, "grad_norm": 4.984555721282959, "learning_rate": 1.371794871794872e-06, "loss": 0.1157, "step": 2019 }, { "epoch": 17.264957264957264, "grad_norm": 9.088909149169922, "learning_rate": 1.3675213675213678e-06, "loss": 0.2842, "step": 2020 }, { "epoch": 17.273504273504273, "grad_norm": 10.398246765136719, "learning_rate": 1.3632478632478632e-06, "loss": 0.2528, "step": 2021 }, { "epoch": 17.28205128205128, "grad_norm": 3.60273814201355, "learning_rate": 1.358974358974359e-06, "loss": 0.1799, "step": 2022 }, { "epoch": 17.29059829059829, "grad_norm": 0.6845250129699707, "learning_rate": 1.3547008547008548e-06, "loss": 0.0196, "step": 2023 }, { "epoch": 17.299145299145298, "grad_norm": 0.5363795161247253, "learning_rate": 1.3504273504273505e-06, "loss": 0.0136, "step": 2024 }, { "epoch": 17.307692307692307, "grad_norm": 3.880434274673462, "learning_rate": 1.3461538461538462e-06, "loss": 0.3665, "step": 2025 }, { "epoch": 17.316239316239315, "grad_norm": 4.580989360809326, "learning_rate": 1.3418803418803418e-06, "loss": 0.2593, "step": 2026 }, { "epoch": 17.324786324786324, "grad_norm": 2.781501293182373, "learning_rate": 1.3376068376068377e-06, "loss": 0.1777, "step": 2027 }, { "epoch": 17.333333333333332, "grad_norm": 5.605004787445068, "learning_rate": 1.3333333333333334e-06, "loss": 0.3633, "step": 2028 }, { "epoch": 17.34188034188034, "grad_norm": 1.696486473083496, "learning_rate": 1.329059829059829e-06, "loss": 0.0353, "step": 2029 }, { "epoch": 17.35042735042735, "grad_norm": 3.4415268898010254, "learning_rate": 1.324786324786325e-06, "loss": 0.0906, "step": 2030 }, { "epoch": 17.358974358974358, "grad_norm": 7.722592353820801, "learning_rate": 1.3205128205128207e-06, "loss": 0.1804, "step": 2031 }, { "epoch": 17.367521367521366, "grad_norm": 3.3161542415618896, "learning_rate": 1.3162393162393163e-06, "loss": 0.1336, "step": 2032 }, { "epoch": 17.376068376068375, "grad_norm": 2.568871021270752, "learning_rate": 1.3119658119658122e-06, "loss": 0.0658, "step": 2033 }, { "epoch": 17.384615384615383, "grad_norm": 3.5799806118011475, "learning_rate": 1.307692307692308e-06, "loss": 0.0652, "step": 2034 }, { "epoch": 17.39316239316239, "grad_norm": 1.1399949789047241, "learning_rate": 1.3034188034188036e-06, "loss": 0.0196, "step": 2035 }, { "epoch": 17.4017094017094, "grad_norm": 2.3688738346099854, "learning_rate": 1.2991452991452993e-06, "loss": 0.0706, "step": 2036 }, { "epoch": 17.41025641025641, "grad_norm": 12.726486206054688, "learning_rate": 1.2948717948717952e-06, "loss": 0.2506, "step": 2037 }, { "epoch": 17.418803418803417, "grad_norm": 2.249285936355591, "learning_rate": 1.2905982905982908e-06, "loss": 0.0532, "step": 2038 }, { "epoch": 17.427350427350426, "grad_norm": 0.7129601836204529, "learning_rate": 1.2863247863247863e-06, "loss": 0.0207, "step": 2039 }, { "epoch": 17.435897435897434, "grad_norm": 1.9362183809280396, "learning_rate": 1.282051282051282e-06, "loss": 0.0311, "step": 2040 }, { "epoch": 17.444444444444443, "grad_norm": 2.253690242767334, "learning_rate": 1.2777777777777779e-06, "loss": 0.1203, "step": 2041 }, { "epoch": 17.45299145299145, "grad_norm": 3.835174798965454, "learning_rate": 1.2735042735042736e-06, "loss": 0.0928, "step": 2042 }, { "epoch": 17.46153846153846, "grad_norm": 143.36563110351562, "learning_rate": 1.2692307692307692e-06, "loss": 0.2984, "step": 2043 }, { "epoch": 17.47008547008547, "grad_norm": 0.6122754216194153, "learning_rate": 1.264957264957265e-06, "loss": 0.0171, "step": 2044 }, { "epoch": 17.478632478632477, "grad_norm": 3.0697991847991943, "learning_rate": 1.2606837606837608e-06, "loss": 0.1412, "step": 2045 }, { "epoch": 17.487179487179485, "grad_norm": 1.0684096813201904, "learning_rate": 1.2564102564102565e-06, "loss": 0.0278, "step": 2046 }, { "epoch": 17.495726495726494, "grad_norm": 5.379480838775635, "learning_rate": 1.2521367521367522e-06, "loss": 0.1114, "step": 2047 }, { "epoch": 17.504273504273506, "grad_norm": 3.893343448638916, "learning_rate": 1.247863247863248e-06, "loss": 0.1499, "step": 2048 }, { "epoch": 17.51282051282051, "grad_norm": 1.0436211824417114, "learning_rate": 1.2435897435897437e-06, "loss": 0.0259, "step": 2049 }, { "epoch": 17.521367521367523, "grad_norm": 2.8706037998199463, "learning_rate": 1.2393162393162394e-06, "loss": 0.1071, "step": 2050 }, { "epoch": 17.52991452991453, "grad_norm": 1.5661158561706543, "learning_rate": 1.2350427350427353e-06, "loss": 0.0392, "step": 2051 }, { "epoch": 17.53846153846154, "grad_norm": 3.7152199745178223, "learning_rate": 1.230769230769231e-06, "loss": 0.0698, "step": 2052 }, { "epoch": 17.54700854700855, "grad_norm": 2.6527271270751953, "learning_rate": 1.2264957264957264e-06, "loss": 0.1276, "step": 2053 }, { "epoch": 17.555555555555557, "grad_norm": 0.9018534421920776, "learning_rate": 1.2222222222222223e-06, "loss": 0.066, "step": 2054 }, { "epoch": 17.564102564102566, "grad_norm": 7.11035680770874, "learning_rate": 1.217948717948718e-06, "loss": 0.0836, "step": 2055 }, { "epoch": 17.572649572649574, "grad_norm": 2.5168066024780273, "learning_rate": 1.2136752136752137e-06, "loss": 0.0662, "step": 2056 }, { "epoch": 17.581196581196583, "grad_norm": 0.7215616703033447, "learning_rate": 1.2094017094017096e-06, "loss": 0.0186, "step": 2057 }, { "epoch": 17.58974358974359, "grad_norm": 7.076876640319824, "learning_rate": 1.2051282051282053e-06, "loss": 0.1493, "step": 2058 }, { "epoch": 17.5982905982906, "grad_norm": 1.1687662601470947, "learning_rate": 1.200854700854701e-06, "loss": 0.0368, "step": 2059 }, { "epoch": 17.60683760683761, "grad_norm": 2.5085737705230713, "learning_rate": 1.1965811965811968e-06, "loss": 0.1567, "step": 2060 }, { "epoch": 17.615384615384617, "grad_norm": 0.43566644191741943, "learning_rate": 1.1923076923076925e-06, "loss": 0.0097, "step": 2061 }, { "epoch": 17.623931623931625, "grad_norm": 0.7698078155517578, "learning_rate": 1.188034188034188e-06, "loss": 0.0231, "step": 2062 }, { "epoch": 17.632478632478634, "grad_norm": 1.8352185487747192, "learning_rate": 1.1837606837606839e-06, "loss": 0.0324, "step": 2063 }, { "epoch": 17.641025641025642, "grad_norm": 12.11907958984375, "learning_rate": 1.1794871794871795e-06, "loss": 0.6052, "step": 2064 }, { "epoch": 17.64957264957265, "grad_norm": 0.49942728877067566, "learning_rate": 1.1752136752136752e-06, "loss": 0.0111, "step": 2065 }, { "epoch": 17.65811965811966, "grad_norm": 3.579129457473755, "learning_rate": 1.1709401709401711e-06, "loss": 0.1706, "step": 2066 }, { "epoch": 17.666666666666668, "grad_norm": 2.112550973892212, "learning_rate": 1.1666666666666668e-06, "loss": 0.0438, "step": 2067 }, { "epoch": 17.675213675213676, "grad_norm": 2.4429895877838135, "learning_rate": 1.1623931623931625e-06, "loss": 0.0498, "step": 2068 }, { "epoch": 17.683760683760685, "grad_norm": 1.8436684608459473, "learning_rate": 1.1581196581196584e-06, "loss": 0.1228, "step": 2069 }, { "epoch": 17.692307692307693, "grad_norm": 4.679569244384766, "learning_rate": 1.153846153846154e-06, "loss": 0.1505, "step": 2070 }, { "epoch": 17.700854700854702, "grad_norm": 2.4409713745117188, "learning_rate": 1.1495726495726495e-06, "loss": 0.0603, "step": 2071 }, { "epoch": 17.70940170940171, "grad_norm": 3.577721118927002, "learning_rate": 1.1452991452991454e-06, "loss": 0.1078, "step": 2072 }, { "epoch": 17.71794871794872, "grad_norm": 3.774958372116089, "learning_rate": 1.141025641025641e-06, "loss": 0.3782, "step": 2073 }, { "epoch": 17.726495726495727, "grad_norm": 2.9011383056640625, "learning_rate": 1.1367521367521368e-06, "loss": 0.0714, "step": 2074 }, { "epoch": 17.735042735042736, "grad_norm": 1.7296162843704224, "learning_rate": 1.1324786324786326e-06, "loss": 0.0463, "step": 2075 }, { "epoch": 17.743589743589745, "grad_norm": 1.8955838680267334, "learning_rate": 1.1282051282051283e-06, "loss": 0.0641, "step": 2076 }, { "epoch": 17.752136752136753, "grad_norm": 3.0198490619659424, "learning_rate": 1.123931623931624e-06, "loss": 0.1516, "step": 2077 }, { "epoch": 17.76068376068376, "grad_norm": 1.5012823343276978, "learning_rate": 1.1196581196581199e-06, "loss": 0.0206, "step": 2078 }, { "epoch": 17.76923076923077, "grad_norm": 2.4390790462493896, "learning_rate": 1.1153846153846156e-06, "loss": 0.0458, "step": 2079 }, { "epoch": 17.77777777777778, "grad_norm": 5.728135585784912, "learning_rate": 1.111111111111111e-06, "loss": 0.0443, "step": 2080 }, { "epoch": 17.786324786324787, "grad_norm": 1.423771858215332, "learning_rate": 1.106837606837607e-06, "loss": 0.0223, "step": 2081 }, { "epoch": 17.794871794871796, "grad_norm": 2.524941921234131, "learning_rate": 1.1025641025641026e-06, "loss": 0.0587, "step": 2082 }, { "epoch": 17.803418803418804, "grad_norm": 0.9632331132888794, "learning_rate": 1.0982905982905983e-06, "loss": 0.0324, "step": 2083 }, { "epoch": 17.811965811965813, "grad_norm": 1.8369181156158447, "learning_rate": 1.0940170940170942e-06, "loss": 0.0182, "step": 2084 }, { "epoch": 17.82051282051282, "grad_norm": 2.547654867172241, "learning_rate": 1.0897435897435899e-06, "loss": 0.1395, "step": 2085 }, { "epoch": 17.82905982905983, "grad_norm": 3.516977310180664, "learning_rate": 1.0854700854700855e-06, "loss": 0.1044, "step": 2086 }, { "epoch": 17.837606837606838, "grad_norm": 1.7064217329025269, "learning_rate": 1.0811965811965814e-06, "loss": 0.0302, "step": 2087 }, { "epoch": 17.846153846153847, "grad_norm": 1.7427505254745483, "learning_rate": 1.076923076923077e-06, "loss": 0.0298, "step": 2088 }, { "epoch": 17.854700854700855, "grad_norm": 1.3395370244979858, "learning_rate": 1.0726495726495726e-06, "loss": 0.0302, "step": 2089 }, { "epoch": 17.863247863247864, "grad_norm": 7.244344711303711, "learning_rate": 1.0683760683760685e-06, "loss": 0.1925, "step": 2090 }, { "epoch": 17.871794871794872, "grad_norm": 5.942878723144531, "learning_rate": 1.0641025641025641e-06, "loss": 0.489, "step": 2091 }, { "epoch": 17.88034188034188, "grad_norm": 3.244260787963867, "learning_rate": 1.0598290598290598e-06, "loss": 0.2538, "step": 2092 }, { "epoch": 17.88888888888889, "grad_norm": 0.9833334684371948, "learning_rate": 1.0555555555555557e-06, "loss": 0.0215, "step": 2093 }, { "epoch": 17.897435897435898, "grad_norm": 3.0194849967956543, "learning_rate": 1.0512820512820514e-06, "loss": 0.07, "step": 2094 }, { "epoch": 17.905982905982906, "grad_norm": 0.48535388708114624, "learning_rate": 1.047008547008547e-06, "loss": 0.0113, "step": 2095 }, { "epoch": 17.914529914529915, "grad_norm": 4.334452152252197, "learning_rate": 1.042735042735043e-06, "loss": 0.127, "step": 2096 }, { "epoch": 17.923076923076923, "grad_norm": 3.54429030418396, "learning_rate": 1.0384615384615386e-06, "loss": 0.0704, "step": 2097 }, { "epoch": 17.931623931623932, "grad_norm": 1.1745219230651855, "learning_rate": 1.034188034188034e-06, "loss": 0.0418, "step": 2098 }, { "epoch": 17.94017094017094, "grad_norm": 5.157544136047363, "learning_rate": 1.02991452991453e-06, "loss": 0.2562, "step": 2099 }, { "epoch": 17.94871794871795, "grad_norm": 4.454767227172852, "learning_rate": 1.0256410256410257e-06, "loss": 0.1141, "step": 2100 }, { "epoch": 17.957264957264957, "grad_norm": 12.859573364257812, "learning_rate": 1.0213675213675213e-06, "loss": 0.3516, "step": 2101 }, { "epoch": 17.965811965811966, "grad_norm": 5.780513763427734, "learning_rate": 1.0170940170940172e-06, "loss": 0.1663, "step": 2102 }, { "epoch": 17.974358974358974, "grad_norm": 2.762153387069702, "learning_rate": 1.012820512820513e-06, "loss": 0.19, "step": 2103 }, { "epoch": 17.982905982905983, "grad_norm": 5.649252891540527, "learning_rate": 1.0085470085470086e-06, "loss": 0.1736, "step": 2104 }, { "epoch": 17.99145299145299, "grad_norm": 5.10836124420166, "learning_rate": 1.0042735042735045e-06, "loss": 0.1739, "step": 2105 }, { "epoch": 18.0, "grad_norm": 6.474237442016602, "learning_rate": 1.0000000000000002e-06, "loss": 0.3239, "step": 2106 }, { "epoch": 18.0, "eval_loss": 0.052614517509937286, "eval_runtime": 9.28, "eval_samples_per_second": 50.216, "eval_steps_per_second": 6.358, "step": 2106 }, { "epoch": 18.00854700854701, "grad_norm": 0.8820164203643799, "learning_rate": 9.957264957264958e-07, "loss": 0.0237, "step": 2107 }, { "epoch": 18.017094017094017, "grad_norm": 2.692166566848755, "learning_rate": 9.914529914529915e-07, "loss": 0.0962, "step": 2108 }, { "epoch": 18.025641025641026, "grad_norm": 0.8048399090766907, "learning_rate": 9.871794871794872e-07, "loss": 0.0232, "step": 2109 }, { "epoch": 18.034188034188034, "grad_norm": 4.4439826011657715, "learning_rate": 9.829059829059829e-07, "loss": 0.064, "step": 2110 }, { "epoch": 18.042735042735043, "grad_norm": 1.62433660030365, "learning_rate": 9.786324786324788e-07, "loss": 0.1263, "step": 2111 }, { "epoch": 18.05128205128205, "grad_norm": 4.766104221343994, "learning_rate": 9.743589743589745e-07, "loss": 0.2108, "step": 2112 }, { "epoch": 18.05982905982906, "grad_norm": 139.34445190429688, "learning_rate": 9.700854700854701e-07, "loss": 0.237, "step": 2113 }, { "epoch": 18.068376068376068, "grad_norm": 0.6069220900535583, "learning_rate": 9.65811965811966e-07, "loss": 0.0135, "step": 2114 }, { "epoch": 18.076923076923077, "grad_norm": 2.7833995819091797, "learning_rate": 9.615384615384617e-07, "loss": 0.1677, "step": 2115 }, { "epoch": 18.085470085470085, "grad_norm": 4.570268630981445, "learning_rate": 9.572649572649574e-07, "loss": 0.2304, "step": 2116 }, { "epoch": 18.094017094017094, "grad_norm": 4.7644805908203125, "learning_rate": 9.529914529914531e-07, "loss": 0.138, "step": 2117 }, { "epoch": 18.102564102564102, "grad_norm": 1.9438762664794922, "learning_rate": 9.487179487179487e-07, "loss": 0.0488, "step": 2118 }, { "epoch": 18.11111111111111, "grad_norm": 1.4188040494918823, "learning_rate": 9.444444444444445e-07, "loss": 0.0545, "step": 2119 }, { "epoch": 18.11965811965812, "grad_norm": 0.357928603887558, "learning_rate": 9.401709401709402e-07, "loss": 0.0092, "step": 2120 }, { "epoch": 18.128205128205128, "grad_norm": 1.8646256923675537, "learning_rate": 9.35897435897436e-07, "loss": 0.086, "step": 2121 }, { "epoch": 18.136752136752136, "grad_norm": 2.111544609069824, "learning_rate": 9.316239316239318e-07, "loss": 0.0319, "step": 2122 }, { "epoch": 18.145299145299145, "grad_norm": 3.0686893463134766, "learning_rate": 9.273504273504274e-07, "loss": 0.0689, "step": 2123 }, { "epoch": 18.153846153846153, "grad_norm": 4.028079509735107, "learning_rate": 9.230769230769232e-07, "loss": 0.125, "step": 2124 }, { "epoch": 18.162393162393162, "grad_norm": 1.0433181524276733, "learning_rate": 9.188034188034189e-07, "loss": 0.0174, "step": 2125 }, { "epoch": 18.17094017094017, "grad_norm": 3.4533402919769287, "learning_rate": 9.145299145299146e-07, "loss": 0.1556, "step": 2126 }, { "epoch": 18.17948717948718, "grad_norm": 11.187241554260254, "learning_rate": 9.102564102564103e-07, "loss": 0.2578, "step": 2127 }, { "epoch": 18.188034188034187, "grad_norm": 2.544975757598877, "learning_rate": 9.05982905982906e-07, "loss": 0.0868, "step": 2128 }, { "epoch": 18.196581196581196, "grad_norm": 2.490493059158325, "learning_rate": 9.017094017094017e-07, "loss": 0.1575, "step": 2129 }, { "epoch": 18.205128205128204, "grad_norm": 4.665895938873291, "learning_rate": 8.974358974358975e-07, "loss": 0.1644, "step": 2130 }, { "epoch": 18.213675213675213, "grad_norm": 3.135772943496704, "learning_rate": 8.931623931623933e-07, "loss": 0.205, "step": 2131 }, { "epoch": 18.22222222222222, "grad_norm": 1.5636606216430664, "learning_rate": 8.88888888888889e-07, "loss": 0.0541, "step": 2132 }, { "epoch": 18.23076923076923, "grad_norm": 3.603691816329956, "learning_rate": 8.846153846153848e-07, "loss": 0.0478, "step": 2133 }, { "epoch": 18.23931623931624, "grad_norm": 2.6537222862243652, "learning_rate": 8.803418803418804e-07, "loss": 0.1206, "step": 2134 }, { "epoch": 18.247863247863247, "grad_norm": 5.086421966552734, "learning_rate": 8.760683760683761e-07, "loss": 0.1212, "step": 2135 }, { "epoch": 18.256410256410255, "grad_norm": 4.673394203186035, "learning_rate": 8.717948717948718e-07, "loss": 0.0588, "step": 2136 }, { "epoch": 18.264957264957264, "grad_norm": 2.1376845836639404, "learning_rate": 8.675213675213676e-07, "loss": 0.0492, "step": 2137 }, { "epoch": 18.273504273504273, "grad_norm": 2.8616504669189453, "learning_rate": 8.632478632478633e-07, "loss": 0.1834, "step": 2138 }, { "epoch": 18.28205128205128, "grad_norm": 2.7179784774780273, "learning_rate": 8.58974358974359e-07, "loss": 0.1508, "step": 2139 }, { "epoch": 18.29059829059829, "grad_norm": 1.1909416913986206, "learning_rate": 8.547008547008548e-07, "loss": 0.0721, "step": 2140 }, { "epoch": 18.299145299145298, "grad_norm": 1.8272216320037842, "learning_rate": 8.504273504273505e-07, "loss": 0.0797, "step": 2141 }, { "epoch": 18.307692307692307, "grad_norm": 4.394528388977051, "learning_rate": 8.461538461538463e-07, "loss": 0.2762, "step": 2142 }, { "epoch": 18.316239316239315, "grad_norm": 4.276169776916504, "learning_rate": 8.41880341880342e-07, "loss": 0.0969, "step": 2143 }, { "epoch": 18.324786324786324, "grad_norm": 2.0932376384735107, "learning_rate": 8.376068376068377e-07, "loss": 0.0595, "step": 2144 }, { "epoch": 18.333333333333332, "grad_norm": 5.714378833770752, "learning_rate": 8.333333333333333e-07, "loss": 0.1176, "step": 2145 }, { "epoch": 18.34188034188034, "grad_norm": 1.1050394773483276, "learning_rate": 8.290598290598291e-07, "loss": 0.0284, "step": 2146 }, { "epoch": 18.35042735042735, "grad_norm": 3.2809271812438965, "learning_rate": 8.247863247863248e-07, "loss": 0.0737, "step": 2147 }, { "epoch": 18.358974358974358, "grad_norm": 2.102889060974121, "learning_rate": 8.205128205128206e-07, "loss": 0.0477, "step": 2148 }, { "epoch": 18.367521367521366, "grad_norm": 1.5728402137756348, "learning_rate": 8.162393162393164e-07, "loss": 0.0476, "step": 2149 }, { "epoch": 18.376068376068375, "grad_norm": 2.0337905883789062, "learning_rate": 8.11965811965812e-07, "loss": 0.019, "step": 2150 }, { "epoch": 18.384615384615383, "grad_norm": 5.475340843200684, "learning_rate": 8.076923076923078e-07, "loss": 0.1625, "step": 2151 }, { "epoch": 18.39316239316239, "grad_norm": 0.4993753135204315, "learning_rate": 8.034188034188035e-07, "loss": 0.0132, "step": 2152 }, { "epoch": 18.4017094017094, "grad_norm": 4.052933216094971, "learning_rate": 7.991452991452992e-07, "loss": 0.1603, "step": 2153 }, { "epoch": 18.41025641025641, "grad_norm": 3.005293607711792, "learning_rate": 7.948717948717949e-07, "loss": 0.0399, "step": 2154 }, { "epoch": 18.418803418803417, "grad_norm": 3.0186731815338135, "learning_rate": 7.905982905982906e-07, "loss": 0.0564, "step": 2155 }, { "epoch": 18.427350427350426, "grad_norm": 5.522226333618164, "learning_rate": 7.863247863247863e-07, "loss": 0.1138, "step": 2156 }, { "epoch": 18.435897435897434, "grad_norm": 5.463916301727295, "learning_rate": 7.820512820512821e-07, "loss": 0.4811, "step": 2157 }, { "epoch": 18.444444444444443, "grad_norm": 0.41404595971107483, "learning_rate": 7.777777777777779e-07, "loss": 0.0114, "step": 2158 }, { "epoch": 18.45299145299145, "grad_norm": 0.9279537200927734, "learning_rate": 7.735042735042736e-07, "loss": 0.0268, "step": 2159 }, { "epoch": 18.46153846153846, "grad_norm": 0.5745738744735718, "learning_rate": 7.692307692307694e-07, "loss": 0.0155, "step": 2160 }, { "epoch": 18.47008547008547, "grad_norm": 2.329507827758789, "learning_rate": 7.64957264957265e-07, "loss": 0.0421, "step": 2161 }, { "epoch": 18.478632478632477, "grad_norm": 2.934424638748169, "learning_rate": 7.606837606837607e-07, "loss": 0.0925, "step": 2162 }, { "epoch": 18.487179487179485, "grad_norm": 3.226261854171753, "learning_rate": 7.564102564102564e-07, "loss": 0.1914, "step": 2163 }, { "epoch": 18.495726495726494, "grad_norm": 1.2033684253692627, "learning_rate": 7.521367521367522e-07, "loss": 0.0218, "step": 2164 }, { "epoch": 18.504273504273506, "grad_norm": 1.092015266418457, "learning_rate": 7.478632478632479e-07, "loss": 0.0165, "step": 2165 }, { "epoch": 18.51282051282051, "grad_norm": 1.2283809185028076, "learning_rate": 7.435897435897436e-07, "loss": 0.025, "step": 2166 }, { "epoch": 18.521367521367523, "grad_norm": 6.3457722663879395, "learning_rate": 7.393162393162394e-07, "loss": 0.2224, "step": 2167 }, { "epoch": 18.52991452991453, "grad_norm": 4.920536518096924, "learning_rate": 7.350427350427351e-07, "loss": 0.1381, "step": 2168 }, { "epoch": 18.53846153846154, "grad_norm": 4.16088342666626, "learning_rate": 7.307692307692309e-07, "loss": 0.2725, "step": 2169 }, { "epoch": 18.54700854700855, "grad_norm": 1.4776932001113892, "learning_rate": 7.264957264957266e-07, "loss": 0.0236, "step": 2170 }, { "epoch": 18.555555555555557, "grad_norm": 5.517492294311523, "learning_rate": 7.222222222222222e-07, "loss": 0.3427, "step": 2171 }, { "epoch": 18.564102564102566, "grad_norm": 0.7798398733139038, "learning_rate": 7.179487179487179e-07, "loss": 0.0139, "step": 2172 }, { "epoch": 18.572649572649574, "grad_norm": 0.7174245119094849, "learning_rate": 7.136752136752137e-07, "loss": 0.0144, "step": 2173 }, { "epoch": 18.581196581196583, "grad_norm": 5.118779182434082, "learning_rate": 7.094017094017094e-07, "loss": 0.1899, "step": 2174 }, { "epoch": 18.58974358974359, "grad_norm": 2.8726353645324707, "learning_rate": 7.051282051282052e-07, "loss": 0.1177, "step": 2175 }, { "epoch": 18.5982905982906, "grad_norm": 2.3775036334991455, "learning_rate": 7.00854700854701e-07, "loss": 0.1183, "step": 2176 }, { "epoch": 18.60683760683761, "grad_norm": 19.23975944519043, "learning_rate": 6.965811965811966e-07, "loss": 0.4534, "step": 2177 }, { "epoch": 18.615384615384617, "grad_norm": 1.3832803964614868, "learning_rate": 6.923076923076924e-07, "loss": 0.0309, "step": 2178 }, { "epoch": 18.623931623931625, "grad_norm": 1.6752214431762695, "learning_rate": 6.880341880341881e-07, "loss": 0.0201, "step": 2179 }, { "epoch": 18.632478632478634, "grad_norm": 3.1885950565338135, "learning_rate": 6.837606837606839e-07, "loss": 0.1242, "step": 2180 }, { "epoch": 18.641025641025642, "grad_norm": 0.9290790557861328, "learning_rate": 6.794871794871795e-07, "loss": 0.0189, "step": 2181 }, { "epoch": 18.64957264957265, "grad_norm": 0.25725051760673523, "learning_rate": 6.752136752136752e-07, "loss": 0.0065, "step": 2182 }, { "epoch": 18.65811965811966, "grad_norm": 1.9815839529037476, "learning_rate": 6.709401709401709e-07, "loss": 0.0576, "step": 2183 }, { "epoch": 18.666666666666668, "grad_norm": 1.924490213394165, "learning_rate": 6.666666666666667e-07, "loss": 0.0671, "step": 2184 }, { "epoch": 18.675213675213676, "grad_norm": 2.9947164058685303, "learning_rate": 6.623931623931625e-07, "loss": 0.1859, "step": 2185 }, { "epoch": 18.683760683760685, "grad_norm": 1.8680211305618286, "learning_rate": 6.581196581196582e-07, "loss": 0.1028, "step": 2186 }, { "epoch": 18.692307692307693, "grad_norm": 0.823103666305542, "learning_rate": 6.53846153846154e-07, "loss": 0.0198, "step": 2187 }, { "epoch": 18.700854700854702, "grad_norm": 2.3616061210632324, "learning_rate": 6.495726495726496e-07, "loss": 0.1025, "step": 2188 }, { "epoch": 18.70940170940171, "grad_norm": 3.1370067596435547, "learning_rate": 6.452991452991454e-07, "loss": 0.0438, "step": 2189 }, { "epoch": 18.71794871794872, "grad_norm": 8.058025360107422, "learning_rate": 6.41025641025641e-07, "loss": 0.082, "step": 2190 }, { "epoch": 18.726495726495727, "grad_norm": 2.1969916820526123, "learning_rate": 6.367521367521368e-07, "loss": 0.1074, "step": 2191 }, { "epoch": 18.735042735042736, "grad_norm": 2.5845255851745605, "learning_rate": 6.324786324786325e-07, "loss": 0.0795, "step": 2192 }, { "epoch": 18.743589743589745, "grad_norm": 3.578331708908081, "learning_rate": 6.282051282051282e-07, "loss": 0.1111, "step": 2193 }, { "epoch": 18.752136752136753, "grad_norm": 1.5390626192092896, "learning_rate": 6.23931623931624e-07, "loss": 0.064, "step": 2194 }, { "epoch": 18.76068376068376, "grad_norm": 3.1742804050445557, "learning_rate": 6.196581196581197e-07, "loss": 0.0971, "step": 2195 }, { "epoch": 18.76923076923077, "grad_norm": 1.7017542123794556, "learning_rate": 6.153846153846155e-07, "loss": 0.0424, "step": 2196 }, { "epoch": 18.77777777777778, "grad_norm": 2.642102003097534, "learning_rate": 6.111111111111112e-07, "loss": 0.1243, "step": 2197 }, { "epoch": 18.786324786324787, "grad_norm": 1.2010291814804077, "learning_rate": 6.068376068376068e-07, "loss": 0.0375, "step": 2198 }, { "epoch": 18.794871794871796, "grad_norm": 3.1580190658569336, "learning_rate": 6.025641025641026e-07, "loss": 0.0565, "step": 2199 }, { "epoch": 18.803418803418804, "grad_norm": 2.7660391330718994, "learning_rate": 5.982905982905984e-07, "loss": 0.0385, "step": 2200 }, { "epoch": 18.811965811965813, "grad_norm": 0.7716617584228516, "learning_rate": 5.94017094017094e-07, "loss": 0.0159, "step": 2201 }, { "epoch": 18.82051282051282, "grad_norm": 3.190251588821411, "learning_rate": 5.897435897435898e-07, "loss": 0.241, "step": 2202 }, { "epoch": 18.82905982905983, "grad_norm": 7.115220069885254, "learning_rate": 5.854700854700856e-07, "loss": 0.1777, "step": 2203 }, { "epoch": 18.837606837606838, "grad_norm": 5.071573257446289, "learning_rate": 5.811965811965812e-07, "loss": 0.5421, "step": 2204 }, { "epoch": 18.846153846153847, "grad_norm": 3.8419785499572754, "learning_rate": 5.76923076923077e-07, "loss": 0.0784, "step": 2205 }, { "epoch": 18.854700854700855, "grad_norm": 2.8234896659851074, "learning_rate": 5.726495726495727e-07, "loss": 0.1071, "step": 2206 }, { "epoch": 18.863247863247864, "grad_norm": 1.4067480564117432, "learning_rate": 5.683760683760684e-07, "loss": 0.0375, "step": 2207 }, { "epoch": 18.871794871794872, "grad_norm": 2.508589029312134, "learning_rate": 5.641025641025642e-07, "loss": 0.0921, "step": 2208 }, { "epoch": 18.88034188034188, "grad_norm": 7.314038276672363, "learning_rate": 5.598290598290599e-07, "loss": 0.3581, "step": 2209 }, { "epoch": 18.88888888888889, "grad_norm": 4.375041961669922, "learning_rate": 5.555555555555555e-07, "loss": 0.1115, "step": 2210 }, { "epoch": 18.897435897435898, "grad_norm": 4.789741516113281, "learning_rate": 5.512820512820513e-07, "loss": 0.1813, "step": 2211 }, { "epoch": 18.905982905982906, "grad_norm": 3.008720874786377, "learning_rate": 5.470085470085471e-07, "loss": 0.104, "step": 2212 }, { "epoch": 18.914529914529915, "grad_norm": 0.6364433765411377, "learning_rate": 5.427350427350428e-07, "loss": 0.0153, "step": 2213 }, { "epoch": 18.923076923076923, "grad_norm": 1.4009958505630493, "learning_rate": 5.384615384615386e-07, "loss": 0.0499, "step": 2214 }, { "epoch": 18.931623931623932, "grad_norm": 4.53135347366333, "learning_rate": 5.341880341880342e-07, "loss": 0.1021, "step": 2215 }, { "epoch": 18.94017094017094, "grad_norm": 0.7855163216590881, "learning_rate": 5.299145299145299e-07, "loss": 0.0297, "step": 2216 }, { "epoch": 18.94871794871795, "grad_norm": 1.5316343307495117, "learning_rate": 5.256410256410257e-07, "loss": 0.0438, "step": 2217 }, { "epoch": 18.957264957264957, "grad_norm": 1.2713849544525146, "learning_rate": 5.213675213675215e-07, "loss": 0.0311, "step": 2218 }, { "epoch": 18.965811965811966, "grad_norm": 1.612418293952942, "learning_rate": 5.17094017094017e-07, "loss": 0.0796, "step": 2219 }, { "epoch": 18.974358974358974, "grad_norm": 6.046596527099609, "learning_rate": 5.128205128205128e-07, "loss": 0.0835, "step": 2220 }, { "epoch": 18.982905982905983, "grad_norm": 2.527993679046631, "learning_rate": 5.085470085470086e-07, "loss": 0.0448, "step": 2221 }, { "epoch": 18.99145299145299, "grad_norm": 0.9519897699356079, "learning_rate": 5.042735042735043e-07, "loss": 0.0223, "step": 2222 }, { "epoch": 19.0, "grad_norm": 14.08708667755127, "learning_rate": 5.000000000000001e-07, "loss": 0.6753, "step": 2223 }, { "epoch": 19.0, "eval_loss": 0.05170569196343422, "eval_runtime": 9.3972, "eval_samples_per_second": 49.589, "eval_steps_per_second": 6.278, "step": 2223 }, { "epoch": 19.00854700854701, "grad_norm": 5.215019702911377, "learning_rate": 4.957264957264958e-07, "loss": 0.1614, "step": 2224 }, { "epoch": 19.017094017094017, "grad_norm": 2.855567216873169, "learning_rate": 4.914529914529914e-07, "loss": 0.1051, "step": 2225 }, { "epoch": 19.025641025641026, "grad_norm": 4.078762054443359, "learning_rate": 4.871794871794872e-07, "loss": 0.2859, "step": 2226 }, { "epoch": 19.034188034188034, "grad_norm": 0.9259152412414551, "learning_rate": 4.82905982905983e-07, "loss": 0.0257, "step": 2227 }, { "epoch": 19.042735042735043, "grad_norm": 3.629925012588501, "learning_rate": 4.786324786324787e-07, "loss": 0.1283, "step": 2228 }, { "epoch": 19.05128205128205, "grad_norm": 3.104196310043335, "learning_rate": 4.7435897435897437e-07, "loss": 0.0701, "step": 2229 }, { "epoch": 19.05982905982906, "grad_norm": 8.760592460632324, "learning_rate": 4.700854700854701e-07, "loss": 0.5793, "step": 2230 }, { "epoch": 19.068376068376068, "grad_norm": 1.2966917753219604, "learning_rate": 4.658119658119659e-07, "loss": 0.0573, "step": 2231 }, { "epoch": 19.076923076923077, "grad_norm": 1.7045038938522339, "learning_rate": 4.615384615384616e-07, "loss": 0.0497, "step": 2232 }, { "epoch": 19.085470085470085, "grad_norm": 7.805142402648926, "learning_rate": 4.572649572649573e-07, "loss": 0.2898, "step": 2233 }, { "epoch": 19.094017094017094, "grad_norm": 0.5019100308418274, "learning_rate": 4.52991452991453e-07, "loss": 0.0132, "step": 2234 }, { "epoch": 19.102564102564102, "grad_norm": 3.1100540161132812, "learning_rate": 4.4871794871794876e-07, "loss": 0.0874, "step": 2235 }, { "epoch": 19.11111111111111, "grad_norm": 0.40422680974006653, "learning_rate": 4.444444444444445e-07, "loss": 0.012, "step": 2236 }, { "epoch": 19.11965811965812, "grad_norm": 1.2845938205718994, "learning_rate": 4.401709401709402e-07, "loss": 0.0259, "step": 2237 }, { "epoch": 19.128205128205128, "grad_norm": 4.621537208557129, "learning_rate": 4.358974358974359e-07, "loss": 0.246, "step": 2238 }, { "epoch": 19.136752136752136, "grad_norm": 1.1688278913497925, "learning_rate": 4.3162393162393163e-07, "loss": 0.0804, "step": 2239 }, { "epoch": 19.145299145299145, "grad_norm": 10.896872520446777, "learning_rate": 4.273504273504274e-07, "loss": 0.2695, "step": 2240 }, { "epoch": 19.153846153846153, "grad_norm": 2.7485415935516357, "learning_rate": 4.2307692307692315e-07, "loss": 0.0474, "step": 2241 }, { "epoch": 19.162393162393162, "grad_norm": 1.1686739921569824, "learning_rate": 4.188034188034188e-07, "loss": 0.0257, "step": 2242 }, { "epoch": 19.17094017094017, "grad_norm": 3.5579254627227783, "learning_rate": 4.1452991452991456e-07, "loss": 0.0419, "step": 2243 }, { "epoch": 19.17948717948718, "grad_norm": 3.088649034500122, "learning_rate": 4.102564102564103e-07, "loss": 0.1229, "step": 2244 }, { "epoch": 19.188034188034187, "grad_norm": 1.4894665479660034, "learning_rate": 4.05982905982906e-07, "loss": 0.0414, "step": 2245 }, { "epoch": 19.196581196581196, "grad_norm": 5.022091865539551, "learning_rate": 4.0170940170940175e-07, "loss": 0.1423, "step": 2246 }, { "epoch": 19.205128205128204, "grad_norm": 1.6117054224014282, "learning_rate": 3.9743589743589743e-07, "loss": 0.0244, "step": 2247 }, { "epoch": 19.213675213675213, "grad_norm": 0.5429085493087769, "learning_rate": 3.9316239316239316e-07, "loss": 0.0122, "step": 2248 }, { "epoch": 19.22222222222222, "grad_norm": 7.429282188415527, "learning_rate": 3.8888888888888895e-07, "loss": 0.122, "step": 2249 }, { "epoch": 19.23076923076923, "grad_norm": 4.492022514343262, "learning_rate": 3.846153846153847e-07, "loss": 0.3181, "step": 2250 }, { "epoch": 19.23931623931624, "grad_norm": 5.219499588012695, "learning_rate": 3.8034188034188036e-07, "loss": 0.1374, "step": 2251 }, { "epoch": 19.247863247863247, "grad_norm": 3.454345941543579, "learning_rate": 3.760683760683761e-07, "loss": 0.147, "step": 2252 }, { "epoch": 19.256410256410255, "grad_norm": 0.6370477080345154, "learning_rate": 3.717948717948718e-07, "loss": 0.0154, "step": 2253 }, { "epoch": 19.264957264957264, "grad_norm": 1.7189971208572388, "learning_rate": 3.6752136752136755e-07, "loss": 0.0635, "step": 2254 }, { "epoch": 19.273504273504273, "grad_norm": 2.716744899749756, "learning_rate": 3.632478632478633e-07, "loss": 0.0966, "step": 2255 }, { "epoch": 19.28205128205128, "grad_norm": 2.4959864616394043, "learning_rate": 3.5897435897435896e-07, "loss": 0.0779, "step": 2256 }, { "epoch": 19.29059829059829, "grad_norm": 3.625793218612671, "learning_rate": 3.547008547008547e-07, "loss": 0.3238, "step": 2257 }, { "epoch": 19.299145299145298, "grad_norm": 1.8783844709396362, "learning_rate": 3.504273504273505e-07, "loss": 0.0319, "step": 2258 }, { "epoch": 19.307692307692307, "grad_norm": 1.6740922927856445, "learning_rate": 3.461538461538462e-07, "loss": 0.0844, "step": 2259 }, { "epoch": 19.316239316239315, "grad_norm": 2.8891098499298096, "learning_rate": 3.4188034188034194e-07, "loss": 0.1916, "step": 2260 }, { "epoch": 19.324786324786324, "grad_norm": 0.9975456595420837, "learning_rate": 3.376068376068376e-07, "loss": 0.0266, "step": 2261 }, { "epoch": 19.333333333333332, "grad_norm": 2.576789379119873, "learning_rate": 3.3333333333333335e-07, "loss": 0.0722, "step": 2262 }, { "epoch": 19.34188034188034, "grad_norm": 9.070858001708984, "learning_rate": 3.290598290598291e-07, "loss": 0.2998, "step": 2263 }, { "epoch": 19.35042735042735, "grad_norm": 3.052319049835205, "learning_rate": 3.247863247863248e-07, "loss": 0.0435, "step": 2264 }, { "epoch": 19.358974358974358, "grad_norm": 0.8035821318626404, "learning_rate": 3.205128205128205e-07, "loss": 0.0233, "step": 2265 }, { "epoch": 19.367521367521366, "grad_norm": 3.7658371925354004, "learning_rate": 3.1623931623931623e-07, "loss": 0.3007, "step": 2266 }, { "epoch": 19.376068376068375, "grad_norm": 1.210494875907898, "learning_rate": 3.11965811965812e-07, "loss": 0.0344, "step": 2267 }, { "epoch": 19.384615384615383, "grad_norm": 1.1121772527694702, "learning_rate": 3.0769230769230774e-07, "loss": 0.054, "step": 2268 }, { "epoch": 19.39316239316239, "grad_norm": 2.842228412628174, "learning_rate": 3.034188034188034e-07, "loss": 0.0814, "step": 2269 }, { "epoch": 19.4017094017094, "grad_norm": 1.9269556999206543, "learning_rate": 2.991452991452992e-07, "loss": 0.0354, "step": 2270 }, { "epoch": 19.41025641025641, "grad_norm": 7.359715938568115, "learning_rate": 2.948717948717949e-07, "loss": 0.3288, "step": 2271 }, { "epoch": 19.418803418803417, "grad_norm": 1.7621564865112305, "learning_rate": 2.905982905982906e-07, "loss": 0.0313, "step": 2272 }, { "epoch": 19.427350427350426, "grad_norm": 2.5410284996032715, "learning_rate": 2.8632478632478635e-07, "loss": 0.076, "step": 2273 }, { "epoch": 19.435897435897434, "grad_norm": 5.633874416351318, "learning_rate": 2.820512820512821e-07, "loss": 0.1903, "step": 2274 }, { "epoch": 19.444444444444443, "grad_norm": 1.935703158378601, "learning_rate": 2.7777777777777776e-07, "loss": 0.3778, "step": 2275 }, { "epoch": 19.45299145299145, "grad_norm": 7.559366703033447, "learning_rate": 2.7350427350427354e-07, "loss": 0.2684, "step": 2276 }, { "epoch": 19.46153846153846, "grad_norm": 9.240869522094727, "learning_rate": 2.692307692307693e-07, "loss": 0.2982, "step": 2277 }, { "epoch": 19.47008547008547, "grad_norm": 6.940350532531738, "learning_rate": 2.6495726495726495e-07, "loss": 0.3131, "step": 2278 }, { "epoch": 19.478632478632477, "grad_norm": 1.3201594352722168, "learning_rate": 2.6068376068376074e-07, "loss": 0.0191, "step": 2279 }, { "epoch": 19.487179487179485, "grad_norm": 1.626806616783142, "learning_rate": 2.564102564102564e-07, "loss": 0.0361, "step": 2280 }, { "epoch": 19.495726495726494, "grad_norm": 8.687582969665527, "learning_rate": 2.5213675213675215e-07, "loss": 0.1942, "step": 2281 }, { "epoch": 19.504273504273506, "grad_norm": 5.104561805725098, "learning_rate": 2.478632478632479e-07, "loss": 0.1906, "step": 2282 }, { "epoch": 19.51282051282051, "grad_norm": 2.8611207008361816, "learning_rate": 2.435897435897436e-07, "loss": 0.1258, "step": 2283 }, { "epoch": 19.521367521367523, "grad_norm": 1.2258422374725342, "learning_rate": 2.3931623931623934e-07, "loss": 0.0186, "step": 2284 }, { "epoch": 19.52991452991453, "grad_norm": 5.307450294494629, "learning_rate": 2.3504273504273505e-07, "loss": 0.1356, "step": 2285 }, { "epoch": 19.53846153846154, "grad_norm": 2.0854647159576416, "learning_rate": 2.307692307692308e-07, "loss": 0.0533, "step": 2286 }, { "epoch": 19.54700854700855, "grad_norm": 1.8560184240341187, "learning_rate": 2.264957264957265e-07, "loss": 0.048, "step": 2287 }, { "epoch": 19.555555555555557, "grad_norm": 5.781933307647705, "learning_rate": 2.2222222222222224e-07, "loss": 0.2769, "step": 2288 }, { "epoch": 19.564102564102566, "grad_norm": 4.858759880065918, "learning_rate": 2.1794871794871795e-07, "loss": 0.4217, "step": 2289 }, { "epoch": 19.572649572649574, "grad_norm": 3.7598235607147217, "learning_rate": 2.136752136752137e-07, "loss": 0.162, "step": 2290 }, { "epoch": 19.581196581196583, "grad_norm": 0.5706556439399719, "learning_rate": 2.094017094017094e-07, "loss": 0.0151, "step": 2291 }, { "epoch": 19.58974358974359, "grad_norm": 5.697900295257568, "learning_rate": 2.0512820512820514e-07, "loss": 0.1015, "step": 2292 }, { "epoch": 19.5982905982906, "grad_norm": 4.635442733764648, "learning_rate": 2.0085470085470088e-07, "loss": 0.1827, "step": 2293 }, { "epoch": 19.60683760683761, "grad_norm": 3.070131778717041, "learning_rate": 1.9658119658119658e-07, "loss": 0.0802, "step": 2294 }, { "epoch": 19.615384615384617, "grad_norm": 0.979217529296875, "learning_rate": 1.9230769230769234e-07, "loss": 0.0237, "step": 2295 }, { "epoch": 19.623931623931625, "grad_norm": 5.640648365020752, "learning_rate": 1.8803418803418804e-07, "loss": 0.0588, "step": 2296 }, { "epoch": 19.632478632478634, "grad_norm": 7.1512861251831055, "learning_rate": 1.8376068376068378e-07, "loss": 0.1942, "step": 2297 }, { "epoch": 19.641025641025642, "grad_norm": 12.868803024291992, "learning_rate": 1.7948717948717948e-07, "loss": 0.2771, "step": 2298 }, { "epoch": 19.64957264957265, "grad_norm": 2.954000234603882, "learning_rate": 1.7521367521367524e-07, "loss": 0.1124, "step": 2299 }, { "epoch": 19.65811965811966, "grad_norm": 0.47206825017929077, "learning_rate": 1.7094017094017097e-07, "loss": 0.0104, "step": 2300 }, { "epoch": 19.666666666666668, "grad_norm": 0.6243001818656921, "learning_rate": 1.6666666666666668e-07, "loss": 0.0145, "step": 2301 }, { "epoch": 19.675213675213676, "grad_norm": 1.6680350303649902, "learning_rate": 1.623931623931624e-07, "loss": 0.0634, "step": 2302 }, { "epoch": 19.683760683760685, "grad_norm": 6.298573017120361, "learning_rate": 1.5811965811965811e-07, "loss": 0.2083, "step": 2303 }, { "epoch": 19.692307692307693, "grad_norm": 0.622466504573822, "learning_rate": 1.5384615384615387e-07, "loss": 0.0155, "step": 2304 }, { "epoch": 19.700854700854702, "grad_norm": 2.289080858230591, "learning_rate": 1.495726495726496e-07, "loss": 0.0698, "step": 2305 }, { "epoch": 19.70940170940171, "grad_norm": 13.065472602844238, "learning_rate": 1.452991452991453e-07, "loss": 0.2587, "step": 2306 }, { "epoch": 19.71794871794872, "grad_norm": 0.903513491153717, "learning_rate": 1.4102564102564104e-07, "loss": 0.0222, "step": 2307 }, { "epoch": 19.726495726495727, "grad_norm": 1.3763283491134644, "learning_rate": 1.3675213675213677e-07, "loss": 0.042, "step": 2308 }, { "epoch": 19.735042735042736, "grad_norm": 3.3493802547454834, "learning_rate": 1.3247863247863248e-07, "loss": 0.1042, "step": 2309 }, { "epoch": 19.743589743589745, "grad_norm": 12.862226486206055, "learning_rate": 1.282051282051282e-07, "loss": 0.359, "step": 2310 }, { "epoch": 19.752136752136753, "grad_norm": 5.56069278717041, "learning_rate": 1.2393162393162394e-07, "loss": 0.1645, "step": 2311 }, { "epoch": 19.76068376068376, "grad_norm": 2.900381326675415, "learning_rate": 1.1965811965811967e-07, "loss": 0.1641, "step": 2312 }, { "epoch": 19.76923076923077, "grad_norm": 1.3674333095550537, "learning_rate": 1.153846153846154e-07, "loss": 0.0428, "step": 2313 }, { "epoch": 19.77777777777778, "grad_norm": 2.06278657913208, "learning_rate": 1.1111111111111112e-07, "loss": 0.0404, "step": 2314 }, { "epoch": 19.786324786324787, "grad_norm": 5.760499954223633, "learning_rate": 1.0683760683760685e-07, "loss": 0.1298, "step": 2315 }, { "epoch": 19.794871794871796, "grad_norm": 3.2554516792297363, "learning_rate": 1.0256410256410257e-07, "loss": 0.0432, "step": 2316 }, { "epoch": 19.803418803418804, "grad_norm": 1.7984355688095093, "learning_rate": 9.829059829059829e-08, "loss": 0.0461, "step": 2317 }, { "epoch": 19.811965811965813, "grad_norm": 1.633736491203308, "learning_rate": 9.401709401709402e-08, "loss": 0.0746, "step": 2318 }, { "epoch": 19.82051282051282, "grad_norm": 2.6958866119384766, "learning_rate": 8.974358974358974e-08, "loss": 0.0852, "step": 2319 }, { "epoch": 19.82905982905983, "grad_norm": 0.9744161367416382, "learning_rate": 8.547008547008549e-08, "loss": 0.0368, "step": 2320 }, { "epoch": 19.837606837606838, "grad_norm": 1.2404037714004517, "learning_rate": 8.11965811965812e-08, "loss": 0.0547, "step": 2321 }, { "epoch": 19.846153846153847, "grad_norm": 1.6044564247131348, "learning_rate": 7.692307692307694e-08, "loss": 0.0441, "step": 2322 }, { "epoch": 19.854700854700855, "grad_norm": 0.47167596220970154, "learning_rate": 7.264957264957265e-08, "loss": 0.0099, "step": 2323 }, { "epoch": 19.863247863247864, "grad_norm": 1.6729376316070557, "learning_rate": 6.837606837606839e-08, "loss": 0.0258, "step": 2324 }, { "epoch": 19.871794871794872, "grad_norm": 0.5823857188224792, "learning_rate": 6.41025641025641e-08, "loss": 0.0131, "step": 2325 }, { "epoch": 19.88034188034188, "grad_norm": 4.055545806884766, "learning_rate": 5.982905982905984e-08, "loss": 0.073, "step": 2326 }, { "epoch": 19.88888888888889, "grad_norm": 2.693838596343994, "learning_rate": 5.555555555555556e-08, "loss": 0.0845, "step": 2327 }, { "epoch": 19.897435897435898, "grad_norm": 0.9895898103713989, "learning_rate": 5.1282051282051286e-08, "loss": 0.0205, "step": 2328 }, { "epoch": 19.905982905982906, "grad_norm": 3.560816526412964, "learning_rate": 4.700854700854701e-08, "loss": 0.0989, "step": 2329 }, { "epoch": 19.914529914529915, "grad_norm": 5.152528762817383, "learning_rate": 4.273504273504274e-08, "loss": 0.0133, "step": 2330 }, { "epoch": 19.923076923076923, "grad_norm": 1.709021806716919, "learning_rate": 3.846153846153847e-08, "loss": 0.068, "step": 2331 }, { "epoch": 19.931623931623932, "grad_norm": 0.4786951541900635, "learning_rate": 3.418803418803419e-08, "loss": 0.0141, "step": 2332 }, { "epoch": 19.94017094017094, "grad_norm": 1.5413727760314941, "learning_rate": 2.991452991452992e-08, "loss": 0.0246, "step": 2333 }, { "epoch": 19.94871794871795, "grad_norm": 1.019601583480835, "learning_rate": 2.5641025641025643e-08, "loss": 0.0199, "step": 2334 }, { "epoch": 19.957264957264957, "grad_norm": 1.6115524768829346, "learning_rate": 2.136752136752137e-08, "loss": 0.0752, "step": 2335 }, { "epoch": 19.965811965811966, "grad_norm": 2.381624698638916, "learning_rate": 1.7094017094017096e-08, "loss": 0.0609, "step": 2336 }, { "epoch": 19.974358974358974, "grad_norm": 1.688704013824463, "learning_rate": 1.2820512820512822e-08, "loss": 0.0419, "step": 2337 }, { "epoch": 19.982905982905983, "grad_norm": 1.643002986907959, "learning_rate": 8.547008547008548e-09, "loss": 0.0456, "step": 2338 }, { "epoch": 19.99145299145299, "grad_norm": 3.5371882915496826, "learning_rate": 4.273504273504274e-09, "loss": 0.0392, "step": 2339 }, { "epoch": 20.0, "grad_norm": 4.692568302154541, "learning_rate": 0.0, "loss": 0.1751, "step": 2340 }, { "epoch": 20.0, "eval_loss": 0.051427390426397324, "eval_runtime": 9.301, "eval_samples_per_second": 50.102, "eval_steps_per_second": 6.343, "step": 2340 } ], "logging_steps": 1, "max_steps": 2340, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 560912565657600.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }