| { |
| "best_metric": 0.051427390426397324, |
| "best_model_checkpoint": "time_base/checkpoint-2340", |
| "epoch": 20.0, |
| "eval_steps": 500, |
| "global_step": 2340, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.008547008547008548, |
| "grad_norm": 221.6373748779297, |
| "learning_rate": 9.995726495726496e-06, |
| "loss": 37.5765, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.017094017094017096, |
| "grad_norm": 219.50563049316406, |
| "learning_rate": 9.991452991452993e-06, |
| "loss": 38.6173, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.02564102564102564, |
| "grad_norm": 180.23829650878906, |
| "learning_rate": 9.987179487179488e-06, |
| "loss": 40.3853, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.03418803418803419, |
| "grad_norm": 166.3365478515625, |
| "learning_rate": 9.982905982905984e-06, |
| "loss": 35.9724, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.042735042735042736, |
| "grad_norm": 199.6571044921875, |
| "learning_rate": 9.97863247863248e-06, |
| "loss": 35.0186, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.05128205128205128, |
| "grad_norm": 180.9748992919922, |
| "learning_rate": 9.974358974358974e-06, |
| "loss": 39.3679, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.05982905982905983, |
| "grad_norm": 200.05496215820312, |
| "learning_rate": 9.970085470085471e-06, |
| "loss": 37.1519, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.06837606837606838, |
| "grad_norm": 154.3177032470703, |
| "learning_rate": 9.965811965811966e-06, |
| "loss": 33.9309, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.07692307692307693, |
| "grad_norm": 198.05914306640625, |
| "learning_rate": 9.961538461538463e-06, |
| "loss": 34.8814, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.08547008547008547, |
| "grad_norm": 168.3035430908203, |
| "learning_rate": 9.957264957264958e-06, |
| "loss": 33.184, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.09401709401709402, |
| "grad_norm": 201.83705139160156, |
| "learning_rate": 9.952991452991455e-06, |
| "loss": 35.4025, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.10256410256410256, |
| "grad_norm": 224.4587860107422, |
| "learning_rate": 9.94871794871795e-06, |
| "loss": 39.222, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.1111111111111111, |
| "grad_norm": 192.1949005126953, |
| "learning_rate": 9.944444444444445e-06, |
| "loss": 37.1982, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.11965811965811966, |
| "grad_norm": 193.05662536621094, |
| "learning_rate": 9.940170940170942e-06, |
| "loss": 38.1325, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.1282051282051282, |
| "grad_norm": 150.61575317382812, |
| "learning_rate": 9.935897435897437e-06, |
| "loss": 34.8682, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.13675213675213677, |
| "grad_norm": 170.1510772705078, |
| "learning_rate": 9.931623931623933e-06, |
| "loss": 33.3652, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.1452991452991453, |
| "grad_norm": 193.86875915527344, |
| "learning_rate": 9.927350427350428e-06, |
| "loss": 35.0785, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.15384615384615385, |
| "grad_norm": 164.41986083984375, |
| "learning_rate": 9.923076923076923e-06, |
| "loss": 31.9719, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.1623931623931624, |
| "grad_norm": 166.08953857421875, |
| "learning_rate": 9.91880341880342e-06, |
| "loss": 34.5398, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.17094017094017094, |
| "grad_norm": 152.2139892578125, |
| "learning_rate": 9.914529914529915e-06, |
| "loss": 36.9092, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.1794871794871795, |
| "grad_norm": 198.23095703125, |
| "learning_rate": 9.910256410256412e-06, |
| "loss": 35.6744, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.18803418803418803, |
| "grad_norm": 174.7784881591797, |
| "learning_rate": 9.905982905982907e-06, |
| "loss": 32.8258, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.19658119658119658, |
| "grad_norm": 133.69859313964844, |
| "learning_rate": 9.901709401709402e-06, |
| "loss": 31.431, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.20512820512820512, |
| "grad_norm": 217.17169189453125, |
| "learning_rate": 9.897435897435899e-06, |
| "loss": 38.5649, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.21367521367521367, |
| "grad_norm": 172.4914093017578, |
| "learning_rate": 9.893162393162394e-06, |
| "loss": 33.9858, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.2222222222222222, |
| "grad_norm": 186.39654541015625, |
| "learning_rate": 9.88888888888889e-06, |
| "loss": 32.8029, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.23076923076923078, |
| "grad_norm": 183.65159606933594, |
| "learning_rate": 9.884615384615386e-06, |
| "loss": 35.8633, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.23931623931623933, |
| "grad_norm": 228.352294921875, |
| "learning_rate": 9.880341880341882e-06, |
| "loss": 35.0285, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.24786324786324787, |
| "grad_norm": 156.77906799316406, |
| "learning_rate": 9.876068376068377e-06, |
| "loss": 29.2608, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.2564102564102564, |
| "grad_norm": 232.8336181640625, |
| "learning_rate": 9.871794871794872e-06, |
| "loss": 35.0349, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.26495726495726496, |
| "grad_norm": 248.63247680664062, |
| "learning_rate": 9.86752136752137e-06, |
| "loss": 34.5067, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.27350427350427353, |
| "grad_norm": 183.5840606689453, |
| "learning_rate": 9.863247863247864e-06, |
| "loss": 30.4758, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.28205128205128205, |
| "grad_norm": 160.54530334472656, |
| "learning_rate": 9.858974358974361e-06, |
| "loss": 31.7959, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.2905982905982906, |
| "grad_norm": 199.88156127929688, |
| "learning_rate": 9.854700854700856e-06, |
| "loss": 35.6482, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.29914529914529914, |
| "grad_norm": 272.9530029296875, |
| "learning_rate": 9.850427350427351e-06, |
| "loss": 33.0804, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.3076923076923077, |
| "grad_norm": 200.0990447998047, |
| "learning_rate": 9.846153846153848e-06, |
| "loss": 33.2675, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.3162393162393162, |
| "grad_norm": 202.014404296875, |
| "learning_rate": 9.841880341880343e-06, |
| "loss": 30.8991, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.3247863247863248, |
| "grad_norm": 181.14865112304688, |
| "learning_rate": 9.837606837606838e-06, |
| "loss": 32.3643, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 134.43423461914062, |
| "learning_rate": 9.833333333333333e-06, |
| "loss": 30.8094, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.3418803418803419, |
| "grad_norm": 155.96640014648438, |
| "learning_rate": 9.82905982905983e-06, |
| "loss": 31.7564, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.3504273504273504, |
| "grad_norm": 146.9285888671875, |
| "learning_rate": 9.824786324786325e-06, |
| "loss": 31.9905, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.358974358974359, |
| "grad_norm": 159.67974853515625, |
| "learning_rate": 9.820512820512821e-06, |
| "loss": 32.5029, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.36752136752136755, |
| "grad_norm": 172.4975128173828, |
| "learning_rate": 9.816239316239316e-06, |
| "loss": 31.2049, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.37606837606837606, |
| "grad_norm": 148.97573852539062, |
| "learning_rate": 9.811965811965812e-06, |
| "loss": 27.1673, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.38461538461538464, |
| "grad_norm": 115.93009185791016, |
| "learning_rate": 9.807692307692308e-06, |
| "loss": 30.3342, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.39316239316239315, |
| "grad_norm": 184.13145446777344, |
| "learning_rate": 9.803418803418803e-06, |
| "loss": 32.317, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.4017094017094017, |
| "grad_norm": 139.3995361328125, |
| "learning_rate": 9.7991452991453e-06, |
| "loss": 29.9643, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.41025641025641024, |
| "grad_norm": 184.97996520996094, |
| "learning_rate": 9.794871794871795e-06, |
| "loss": 30.6427, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.4188034188034188, |
| "grad_norm": 120.04417419433594, |
| "learning_rate": 9.790598290598292e-06, |
| "loss": 26.9772, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.42735042735042733, |
| "grad_norm": 183.2873077392578, |
| "learning_rate": 9.786324786324787e-06, |
| "loss": 31.6688, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.4358974358974359, |
| "grad_norm": 206.44898986816406, |
| "learning_rate": 9.782051282051282e-06, |
| "loss": 32.0574, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.4444444444444444, |
| "grad_norm": 180.7601318359375, |
| "learning_rate": 9.777777777777779e-06, |
| "loss": 31.2178, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.452991452991453, |
| "grad_norm": 150.44012451171875, |
| "learning_rate": 9.773504273504274e-06, |
| "loss": 29.9826, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.46153846153846156, |
| "grad_norm": 119.02840423583984, |
| "learning_rate": 9.76923076923077e-06, |
| "loss": 26.876, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.4700854700854701, |
| "grad_norm": 164.58209228515625, |
| "learning_rate": 9.764957264957265e-06, |
| "loss": 28.1059, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.47863247863247865, |
| "grad_norm": 160.416259765625, |
| "learning_rate": 9.76068376068376e-06, |
| "loss": 28.7022, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.48717948717948717, |
| "grad_norm": 177.29747009277344, |
| "learning_rate": 9.756410256410257e-06, |
| "loss": 30.7275, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.49572649572649574, |
| "grad_norm": 153.59686279296875, |
| "learning_rate": 9.752136752136752e-06, |
| "loss": 28.5575, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.5042735042735043, |
| "grad_norm": 155.79617309570312, |
| "learning_rate": 9.747863247863249e-06, |
| "loss": 28.1139, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.5128205128205128, |
| "grad_norm": 173.02581787109375, |
| "learning_rate": 9.743589743589744e-06, |
| "loss": 30.4744, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.5213675213675214, |
| "grad_norm": 125.31639862060547, |
| "learning_rate": 9.739316239316239e-06, |
| "loss": 26.5559, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.5299145299145299, |
| "grad_norm": 149.00302124023438, |
| "learning_rate": 9.735042735042736e-06, |
| "loss": 30.4065, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.5384615384615384, |
| "grad_norm": 101.76395416259766, |
| "learning_rate": 9.730769230769231e-06, |
| "loss": 25.8895, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.5470085470085471, |
| "grad_norm": 134.40159606933594, |
| "learning_rate": 9.726495726495728e-06, |
| "loss": 26.9317, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.5555555555555556, |
| "grad_norm": 151.01914978027344, |
| "learning_rate": 9.722222222222223e-06, |
| "loss": 27.9913, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.5641025641025641, |
| "grad_norm": 124.92068481445312, |
| "learning_rate": 9.71794871794872e-06, |
| "loss": 26.7874, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.5726495726495726, |
| "grad_norm": 131.29762268066406, |
| "learning_rate": 9.713675213675214e-06, |
| "loss": 27.4047, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.5811965811965812, |
| "grad_norm": 154.37120056152344, |
| "learning_rate": 9.70940170940171e-06, |
| "loss": 26.6812, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.5897435897435898, |
| "grad_norm": 86.31095886230469, |
| "learning_rate": 9.705128205128206e-06, |
| "loss": 22.9869, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.5982905982905983, |
| "grad_norm": 224.42613220214844, |
| "learning_rate": 9.700854700854701e-06, |
| "loss": 28.4812, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.6068376068376068, |
| "grad_norm": 156.15228271484375, |
| "learning_rate": 9.696581196581198e-06, |
| "loss": 26.1761, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.6153846153846154, |
| "grad_norm": 117.7806167602539, |
| "learning_rate": 9.692307692307693e-06, |
| "loss": 20.7307, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.6239316239316239, |
| "grad_norm": 169.99154663085938, |
| "learning_rate": 9.688034188034188e-06, |
| "loss": 27.6369, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.6324786324786325, |
| "grad_norm": 98.81549072265625, |
| "learning_rate": 9.683760683760685e-06, |
| "loss": 24.5898, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.6410256410256411, |
| "grad_norm": 199.0179443359375, |
| "learning_rate": 9.67948717948718e-06, |
| "loss": 27.664, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.6495726495726496, |
| "grad_norm": 129.81033325195312, |
| "learning_rate": 9.675213675213677e-06, |
| "loss": 25.2547, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.6581196581196581, |
| "grad_norm": 140.1121826171875, |
| "learning_rate": 9.670940170940172e-06, |
| "loss": 27.4914, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 139.8365478515625, |
| "learning_rate": 9.666666666666667e-06, |
| "loss": 24.0178, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.6752136752136753, |
| "grad_norm": 147.24945068359375, |
| "learning_rate": 9.662393162393163e-06, |
| "loss": 27.1404, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.6837606837606838, |
| "grad_norm": 165.67242431640625, |
| "learning_rate": 9.658119658119659e-06, |
| "loss": 25.6604, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.6923076923076923, |
| "grad_norm": 114.36772918701172, |
| "learning_rate": 9.653846153846155e-06, |
| "loss": 24.3695, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.7008547008547008, |
| "grad_norm": 149.76258850097656, |
| "learning_rate": 9.64957264957265e-06, |
| "loss": 26.5265, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.7094017094017094, |
| "grad_norm": 121.9085693359375, |
| "learning_rate": 9.645299145299147e-06, |
| "loss": 25.7008, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.717948717948718, |
| "grad_norm": 106.49151611328125, |
| "learning_rate": 9.641025641025642e-06, |
| "loss": 20.5777, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.7264957264957265, |
| "grad_norm": 114.2357406616211, |
| "learning_rate": 9.636752136752137e-06, |
| "loss": 23.3429, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.7350427350427351, |
| "grad_norm": 107.38651275634766, |
| "learning_rate": 9.632478632478634e-06, |
| "loss": 24.6408, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.7435897435897436, |
| "grad_norm": 120.4283218383789, |
| "learning_rate": 9.628205128205129e-06, |
| "loss": 23.4563, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.7521367521367521, |
| "grad_norm": 165.21783447265625, |
| "learning_rate": 9.623931623931626e-06, |
| "loss": 25.878, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.7606837606837606, |
| "grad_norm": 105.8712387084961, |
| "learning_rate": 9.61965811965812e-06, |
| "loss": 23.605, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 147.31253051757812, |
| "learning_rate": 9.615384615384616e-06, |
| "loss": 24.537, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.7777777777777778, |
| "grad_norm": 127.37718963623047, |
| "learning_rate": 9.611111111111112e-06, |
| "loss": 24.6762, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.7863247863247863, |
| "grad_norm": 139.40553283691406, |
| "learning_rate": 9.606837606837607e-06, |
| "loss": 23.6076, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.7948717948717948, |
| "grad_norm": 218.39170837402344, |
| "learning_rate": 9.602564102564104e-06, |
| "loss": 25.2559, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.8034188034188035, |
| "grad_norm": 115.83401489257812, |
| "learning_rate": 9.5982905982906e-06, |
| "loss": 23.6758, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.811965811965812, |
| "grad_norm": 115.8538818359375, |
| "learning_rate": 9.594017094017094e-06, |
| "loss": 24.2789, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.8205128205128205, |
| "grad_norm": 122.31534576416016, |
| "learning_rate": 9.589743589743591e-06, |
| "loss": 23.5114, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.8290598290598291, |
| "grad_norm": 171.58558654785156, |
| "learning_rate": 9.585470085470086e-06, |
| "loss": 24.7028, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.8376068376068376, |
| "grad_norm": 113.29806518554688, |
| "learning_rate": 9.581196581196583e-06, |
| "loss": 24.9667, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.8461538461538461, |
| "grad_norm": 183.74928283691406, |
| "learning_rate": 9.576923076923078e-06, |
| "loss": 24.7776, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.8547008547008547, |
| "grad_norm": 139.84701538085938, |
| "learning_rate": 9.572649572649575e-06, |
| "loss": 22.1558, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.8632478632478633, |
| "grad_norm": 145.9014129638672, |
| "learning_rate": 9.56837606837607e-06, |
| "loss": 23.0282, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.8717948717948718, |
| "grad_norm": 195.9859619140625, |
| "learning_rate": 9.564102564102565e-06, |
| "loss": 23.7194, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.8803418803418803, |
| "grad_norm": 70.51985168457031, |
| "learning_rate": 9.559829059829061e-06, |
| "loss": 16.9605, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 184.04209899902344, |
| "learning_rate": 9.555555555555556e-06, |
| "loss": 23.4229, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.8974358974358975, |
| "grad_norm": 177.86727905273438, |
| "learning_rate": 9.551282051282053e-06, |
| "loss": 23.6004, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.905982905982906, |
| "grad_norm": 154.30784606933594, |
| "learning_rate": 9.547008547008548e-06, |
| "loss": 21.6725, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.9145299145299145, |
| "grad_norm": 104.27069854736328, |
| "learning_rate": 9.542735042735043e-06, |
| "loss": 22.856, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.9230769230769231, |
| "grad_norm": 157.4270477294922, |
| "learning_rate": 9.53846153846154e-06, |
| "loss": 24.398, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.9316239316239316, |
| "grad_norm": 123.56739807128906, |
| "learning_rate": 9.534188034188035e-06, |
| "loss": 20.6925, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.9401709401709402, |
| "grad_norm": 106.64054870605469, |
| "learning_rate": 9.52991452991453e-06, |
| "loss": 23.5794, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.9487179487179487, |
| "grad_norm": 88.68234252929688, |
| "learning_rate": 9.525641025641025e-06, |
| "loss": 20.729, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.9572649572649573, |
| "grad_norm": 91.86422729492188, |
| "learning_rate": 9.521367521367522e-06, |
| "loss": 18.7701, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.9658119658119658, |
| "grad_norm": 118.74354553222656, |
| "learning_rate": 9.517094017094017e-06, |
| "loss": 20.8439, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.9743589743589743, |
| "grad_norm": 120.72904968261719, |
| "learning_rate": 9.512820512820514e-06, |
| "loss": 21.1903, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.9829059829059829, |
| "grad_norm": 107.36665344238281, |
| "learning_rate": 9.508547008547009e-06, |
| "loss": 19.3457, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.9914529914529915, |
| "grad_norm": 131.74441528320312, |
| "learning_rate": 9.504273504273504e-06, |
| "loss": 21.4035, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 161.97703552246094, |
| "learning_rate": 9.5e-06, |
| "loss": 22.3831, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 17.230430603027344, |
| "eval_runtime": 9.9187, |
| "eval_samples_per_second": 46.982, |
| "eval_steps_per_second": 5.948, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.0085470085470085, |
| "grad_norm": 109.44770050048828, |
| "learning_rate": 9.495726495726496e-06, |
| "loss": 20.3406, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.017094017094017, |
| "grad_norm": 96.50030517578125, |
| "learning_rate": 9.491452991452992e-06, |
| "loss": 19.8086, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.0256410256410255, |
| "grad_norm": 91.27509307861328, |
| "learning_rate": 9.487179487179487e-06, |
| "loss": 18.8737, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.0341880341880343, |
| "grad_norm": 123.94478607177734, |
| "learning_rate": 9.482905982905984e-06, |
| "loss": 20.1785, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.0427350427350428, |
| "grad_norm": 109.29426574707031, |
| "learning_rate": 9.478632478632479e-06, |
| "loss": 18.8151, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.0512820512820513, |
| "grad_norm": 104.0233383178711, |
| "learning_rate": 9.474358974358974e-06, |
| "loss": 19.6281, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.0598290598290598, |
| "grad_norm": 75.7523193359375, |
| "learning_rate": 9.470085470085471e-06, |
| "loss": 18.5031, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.0683760683760684, |
| "grad_norm": 103.1374740600586, |
| "learning_rate": 9.465811965811966e-06, |
| "loss": 19.6443, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.0769230769230769, |
| "grad_norm": 92.68035888671875, |
| "learning_rate": 9.461538461538463e-06, |
| "loss": 19.7327, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.0854700854700854, |
| "grad_norm": 88.10079193115234, |
| "learning_rate": 9.457264957264958e-06, |
| "loss": 17.8832, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.0940170940170941, |
| "grad_norm": 80.04244232177734, |
| "learning_rate": 9.452991452991453e-06, |
| "loss": 16.4485, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.1025641025641026, |
| "grad_norm": 81.02445983886719, |
| "learning_rate": 9.44871794871795e-06, |
| "loss": 17.3035, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 98.95979309082031, |
| "learning_rate": 9.444444444444445e-06, |
| "loss": 17.5734, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.1196581196581197, |
| "grad_norm": 109.76984405517578, |
| "learning_rate": 9.440170940170941e-06, |
| "loss": 20.3985, |
| "step": 131 |
| }, |
| { |
| "epoch": 1.1282051282051282, |
| "grad_norm": 98.52857208251953, |
| "learning_rate": 9.435897435897436e-06, |
| "loss": 17.7275, |
| "step": 132 |
| }, |
| { |
| "epoch": 1.1367521367521367, |
| "grad_norm": 91.28802490234375, |
| "learning_rate": 9.431623931623931e-06, |
| "loss": 17.9107, |
| "step": 133 |
| }, |
| { |
| "epoch": 1.1452991452991452, |
| "grad_norm": 92.89081573486328, |
| "learning_rate": 9.427350427350428e-06, |
| "loss": 18.2876, |
| "step": 134 |
| }, |
| { |
| "epoch": 1.1538461538461537, |
| "grad_norm": 78.9795150756836, |
| "learning_rate": 9.423076923076923e-06, |
| "loss": 15.5738, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.1623931623931625, |
| "grad_norm": 83.77166748046875, |
| "learning_rate": 9.41880341880342e-06, |
| "loss": 16.0825, |
| "step": 136 |
| }, |
| { |
| "epoch": 1.170940170940171, |
| "grad_norm": 129.62966918945312, |
| "learning_rate": 9.414529914529915e-06, |
| "loss": 18.4077, |
| "step": 137 |
| }, |
| { |
| "epoch": 1.1794871794871795, |
| "grad_norm": 110.26199340820312, |
| "learning_rate": 9.410256410256412e-06, |
| "loss": 17.6436, |
| "step": 138 |
| }, |
| { |
| "epoch": 1.188034188034188, |
| "grad_norm": 95.36865997314453, |
| "learning_rate": 9.405982905982907e-06, |
| "loss": 19.0424, |
| "step": 139 |
| }, |
| { |
| "epoch": 1.1965811965811965, |
| "grad_norm": 98.36263275146484, |
| "learning_rate": 9.401709401709402e-06, |
| "loss": 16.6122, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.205128205128205, |
| "grad_norm": 83.68401336669922, |
| "learning_rate": 9.397435897435899e-06, |
| "loss": 14.9218, |
| "step": 141 |
| }, |
| { |
| "epoch": 1.2136752136752136, |
| "grad_norm": 92.4602279663086, |
| "learning_rate": 9.393162393162394e-06, |
| "loss": 16.3563, |
| "step": 142 |
| }, |
| { |
| "epoch": 1.2222222222222223, |
| "grad_norm": 106.41629791259766, |
| "learning_rate": 9.38888888888889e-06, |
| "loss": 16.4447, |
| "step": 143 |
| }, |
| { |
| "epoch": 1.2307692307692308, |
| "grad_norm": 97.70237731933594, |
| "learning_rate": 9.384615384615385e-06, |
| "loss": 16.8154, |
| "step": 144 |
| }, |
| { |
| "epoch": 1.2393162393162394, |
| "grad_norm": 76.88361358642578, |
| "learning_rate": 9.38034188034188e-06, |
| "loss": 15.7116, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.2478632478632479, |
| "grad_norm": 104.20966339111328, |
| "learning_rate": 9.376068376068377e-06, |
| "loss": 15.2283, |
| "step": 146 |
| }, |
| { |
| "epoch": 1.2564102564102564, |
| "grad_norm": 80.29965209960938, |
| "learning_rate": 9.371794871794872e-06, |
| "loss": 15.3238, |
| "step": 147 |
| }, |
| { |
| "epoch": 1.264957264957265, |
| "grad_norm": 72.6979751586914, |
| "learning_rate": 9.367521367521369e-06, |
| "loss": 14.2293, |
| "step": 148 |
| }, |
| { |
| "epoch": 1.2735042735042734, |
| "grad_norm": 80.29464721679688, |
| "learning_rate": 9.363247863247864e-06, |
| "loss": 11.9706, |
| "step": 149 |
| }, |
| { |
| "epoch": 1.282051282051282, |
| "grad_norm": 97.80663299560547, |
| "learning_rate": 9.358974358974359e-06, |
| "loss": 14.3517, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.2905982905982907, |
| "grad_norm": 75.88921356201172, |
| "learning_rate": 9.354700854700856e-06, |
| "loss": 12.8289, |
| "step": 151 |
| }, |
| { |
| "epoch": 1.2991452991452992, |
| "grad_norm": 75.71963500976562, |
| "learning_rate": 9.35042735042735e-06, |
| "loss": 15.2496, |
| "step": 152 |
| }, |
| { |
| "epoch": 1.3076923076923077, |
| "grad_norm": 84.5454330444336, |
| "learning_rate": 9.346153846153847e-06, |
| "loss": 15.7946, |
| "step": 153 |
| }, |
| { |
| "epoch": 1.3162393162393162, |
| "grad_norm": 92.24919128417969, |
| "learning_rate": 9.341880341880343e-06, |
| "loss": 13.2751, |
| "step": 154 |
| }, |
| { |
| "epoch": 1.3247863247863247, |
| "grad_norm": 76.51255798339844, |
| "learning_rate": 9.33760683760684e-06, |
| "loss": 14.1861, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 74.59149169921875, |
| "learning_rate": 9.333333333333334e-06, |
| "loss": 12.1881, |
| "step": 156 |
| }, |
| { |
| "epoch": 1.341880341880342, |
| "grad_norm": 69.84959411621094, |
| "learning_rate": 9.32905982905983e-06, |
| "loss": 13.1244, |
| "step": 157 |
| }, |
| { |
| "epoch": 1.3504273504273505, |
| "grad_norm": 82.09815979003906, |
| "learning_rate": 9.324786324786326e-06, |
| "loss": 12.7492, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.358974358974359, |
| "grad_norm": 87.25080108642578, |
| "learning_rate": 9.320512820512821e-06, |
| "loss": 15.5268, |
| "step": 159 |
| }, |
| { |
| "epoch": 1.3675213675213675, |
| "grad_norm": 51.60975646972656, |
| "learning_rate": 9.316239316239318e-06, |
| "loss": 10.9868, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.376068376068376, |
| "grad_norm": 65.10023498535156, |
| "learning_rate": 9.311965811965813e-06, |
| "loss": 13.2106, |
| "step": 161 |
| }, |
| { |
| "epoch": 1.3846153846153846, |
| "grad_norm": 86.3865737915039, |
| "learning_rate": 9.307692307692308e-06, |
| "loss": 12.4873, |
| "step": 162 |
| }, |
| { |
| "epoch": 1.393162393162393, |
| "grad_norm": 89.5868911743164, |
| "learning_rate": 9.303418803418805e-06, |
| "loss": 12.3125, |
| "step": 163 |
| }, |
| { |
| "epoch": 1.4017094017094016, |
| "grad_norm": 87.308837890625, |
| "learning_rate": 9.2991452991453e-06, |
| "loss": 13.1855, |
| "step": 164 |
| }, |
| { |
| "epoch": 1.4102564102564101, |
| "grad_norm": 79.86372375488281, |
| "learning_rate": 9.294871794871796e-06, |
| "loss": 11.2756, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.4188034188034189, |
| "grad_norm": 64.42597961425781, |
| "learning_rate": 9.290598290598292e-06, |
| "loss": 11.7395, |
| "step": 166 |
| }, |
| { |
| "epoch": 1.4273504273504274, |
| "grad_norm": 64.65245056152344, |
| "learning_rate": 9.286324786324787e-06, |
| "loss": 10.2739, |
| "step": 167 |
| }, |
| { |
| "epoch": 1.435897435897436, |
| "grad_norm": 49.57310104370117, |
| "learning_rate": 9.282051282051283e-06, |
| "loss": 11.4798, |
| "step": 168 |
| }, |
| { |
| "epoch": 1.4444444444444444, |
| "grad_norm": 89.93653106689453, |
| "learning_rate": 9.277777777777778e-06, |
| "loss": 13.8041, |
| "step": 169 |
| }, |
| { |
| "epoch": 1.452991452991453, |
| "grad_norm": 59.6973876953125, |
| "learning_rate": 9.273504273504275e-06, |
| "loss": 11.0414, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.4615384615384617, |
| "grad_norm": 63.07640838623047, |
| "learning_rate": 9.26923076923077e-06, |
| "loss": 10.2649, |
| "step": 171 |
| }, |
| { |
| "epoch": 1.4700854700854702, |
| "grad_norm": 121.3633041381836, |
| "learning_rate": 9.264957264957267e-06, |
| "loss": 11.9233, |
| "step": 172 |
| }, |
| { |
| "epoch": 1.4786324786324787, |
| "grad_norm": 50.96989822387695, |
| "learning_rate": 9.260683760683762e-06, |
| "loss": 8.3527, |
| "step": 173 |
| }, |
| { |
| "epoch": 1.4871794871794872, |
| "grad_norm": 71.61744689941406, |
| "learning_rate": 9.256410256410257e-06, |
| "loss": 11.4237, |
| "step": 174 |
| }, |
| { |
| "epoch": 1.4957264957264957, |
| "grad_norm": 69.43048858642578, |
| "learning_rate": 9.252136752136754e-06, |
| "loss": 9.9193, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.5042735042735043, |
| "grad_norm": 130.2714385986328, |
| "learning_rate": 9.247863247863249e-06, |
| "loss": 12.0676, |
| "step": 176 |
| }, |
| { |
| "epoch": 1.5128205128205128, |
| "grad_norm": 51.40456008911133, |
| "learning_rate": 9.243589743589745e-06, |
| "loss": 9.2348, |
| "step": 177 |
| }, |
| { |
| "epoch": 1.5213675213675213, |
| "grad_norm": 48.94670486450195, |
| "learning_rate": 9.23931623931624e-06, |
| "loss": 8.8217, |
| "step": 178 |
| }, |
| { |
| "epoch": 1.5299145299145298, |
| "grad_norm": 54.54533386230469, |
| "learning_rate": 9.235042735042736e-06, |
| "loss": 9.2478, |
| "step": 179 |
| }, |
| { |
| "epoch": 1.5384615384615383, |
| "grad_norm": 46.581939697265625, |
| "learning_rate": 9.230769230769232e-06, |
| "loss": 8.746, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.547008547008547, |
| "grad_norm": 49.31954574584961, |
| "learning_rate": 9.226495726495727e-06, |
| "loss": 8.7889, |
| "step": 181 |
| }, |
| { |
| "epoch": 1.5555555555555556, |
| "grad_norm": 48.5145378112793, |
| "learning_rate": 9.222222222222224e-06, |
| "loss": 8.4478, |
| "step": 182 |
| }, |
| { |
| "epoch": 1.564102564102564, |
| "grad_norm": 49.587825775146484, |
| "learning_rate": 9.217948717948717e-06, |
| "loss": 10.5022, |
| "step": 183 |
| }, |
| { |
| "epoch": 1.5726495726495726, |
| "grad_norm": 47.89423751831055, |
| "learning_rate": 9.213675213675214e-06, |
| "loss": 8.7681, |
| "step": 184 |
| }, |
| { |
| "epoch": 1.5811965811965814, |
| "grad_norm": 59.971920013427734, |
| "learning_rate": 9.20940170940171e-06, |
| "loss": 9.6469, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.5897435897435899, |
| "grad_norm": 41.139957427978516, |
| "learning_rate": 9.205128205128206e-06, |
| "loss": 8.5196, |
| "step": 186 |
| }, |
| { |
| "epoch": 1.5982905982905984, |
| "grad_norm": 36.8078498840332, |
| "learning_rate": 9.200854700854701e-06, |
| "loss": 8.2513, |
| "step": 187 |
| }, |
| { |
| "epoch": 1.606837606837607, |
| "grad_norm": 62.23011016845703, |
| "learning_rate": 9.196581196581196e-06, |
| "loss": 9.239, |
| "step": 188 |
| }, |
| { |
| "epoch": 1.6153846153846154, |
| "grad_norm": 41.35377502441406, |
| "learning_rate": 9.192307692307693e-06, |
| "loss": 8.6788, |
| "step": 189 |
| }, |
| { |
| "epoch": 1.623931623931624, |
| "grad_norm": 53.734134674072266, |
| "learning_rate": 9.188034188034188e-06, |
| "loss": 8.2624, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.6324786324786325, |
| "grad_norm": 60.738887786865234, |
| "learning_rate": 9.183760683760685e-06, |
| "loss": 9.2777, |
| "step": 191 |
| }, |
| { |
| "epoch": 1.641025641025641, |
| "grad_norm": 26.411643981933594, |
| "learning_rate": 9.17948717948718e-06, |
| "loss": 7.6894, |
| "step": 192 |
| }, |
| { |
| "epoch": 1.6495726495726495, |
| "grad_norm": 37.81135940551758, |
| "learning_rate": 9.175213675213676e-06, |
| "loss": 8.009, |
| "step": 193 |
| }, |
| { |
| "epoch": 1.658119658119658, |
| "grad_norm": 42.451080322265625, |
| "learning_rate": 9.170940170940171e-06, |
| "loss": 8.309, |
| "step": 194 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 54.87519073486328, |
| "learning_rate": 9.166666666666666e-06, |
| "loss": 8.3505, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.6752136752136753, |
| "grad_norm": 47.997737884521484, |
| "learning_rate": 9.162393162393163e-06, |
| "loss": 8.9444, |
| "step": 196 |
| }, |
| { |
| "epoch": 1.6837606837606838, |
| "grad_norm": 33.1911506652832, |
| "learning_rate": 9.158119658119658e-06, |
| "loss": 6.8856, |
| "step": 197 |
| }, |
| { |
| "epoch": 1.6923076923076923, |
| "grad_norm": 28.42953872680664, |
| "learning_rate": 9.153846153846155e-06, |
| "loss": 7.0575, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.7008547008547008, |
| "grad_norm": 34.74330520629883, |
| "learning_rate": 9.14957264957265e-06, |
| "loss": 7.6837, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.7094017094017095, |
| "grad_norm": 27.730812072753906, |
| "learning_rate": 9.145299145299145e-06, |
| "loss": 7.2591, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.717948717948718, |
| "grad_norm": 36.658966064453125, |
| "learning_rate": 9.141025641025642e-06, |
| "loss": 7.6744, |
| "step": 201 |
| }, |
| { |
| "epoch": 1.7264957264957266, |
| "grad_norm": 52.580074310302734, |
| "learning_rate": 9.136752136752137e-06, |
| "loss": 8.9746, |
| "step": 202 |
| }, |
| { |
| "epoch": 1.735042735042735, |
| "grad_norm": 26.30430030822754, |
| "learning_rate": 9.132478632478634e-06, |
| "loss": 7.0829, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.7435897435897436, |
| "grad_norm": 35.77456283569336, |
| "learning_rate": 9.128205128205129e-06, |
| "loss": 7.46, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.7521367521367521, |
| "grad_norm": 46.80126953125, |
| "learning_rate": 9.123931623931624e-06, |
| "loss": 8.0331, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.7606837606837606, |
| "grad_norm": 26.510988235473633, |
| "learning_rate": 9.11965811965812e-06, |
| "loss": 7.0434, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.7692307692307692, |
| "grad_norm": 30.846357345581055, |
| "learning_rate": 9.115384615384615e-06, |
| "loss": 6.9022, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.7777777777777777, |
| "grad_norm": 45.06099319458008, |
| "learning_rate": 9.111111111111112e-06, |
| "loss": 7.108, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.7863247863247862, |
| "grad_norm": 40.050079345703125, |
| "learning_rate": 9.106837606837607e-06, |
| "loss": 7.3628, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.7948717948717947, |
| "grad_norm": 32.066261291503906, |
| "learning_rate": 9.102564102564104e-06, |
| "loss": 7.3292, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.8034188034188035, |
| "grad_norm": 29.196252822875977, |
| "learning_rate": 9.098290598290599e-06, |
| "loss": 6.6194, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.811965811965812, |
| "grad_norm": 34.54549026489258, |
| "learning_rate": 9.094017094017094e-06, |
| "loss": 7.224, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.8205128205128205, |
| "grad_norm": 31.863550186157227, |
| "learning_rate": 9.08974358974359e-06, |
| "loss": 7.141, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.8290598290598292, |
| "grad_norm": 36.79090118408203, |
| "learning_rate": 9.085470085470086e-06, |
| "loss": 6.9572, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.8376068376068377, |
| "grad_norm": 24.298635482788086, |
| "learning_rate": 9.081196581196583e-06, |
| "loss": 6.6881, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.8461538461538463, |
| "grad_norm": 16.75456428527832, |
| "learning_rate": 9.076923076923078e-06, |
| "loss": 6.4055, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.8547008547008548, |
| "grad_norm": 20.152400970458984, |
| "learning_rate": 9.072649572649573e-06, |
| "loss": 6.9078, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.8632478632478633, |
| "grad_norm": 34.73337173461914, |
| "learning_rate": 9.06837606837607e-06, |
| "loss": 6.7923, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.8717948717948718, |
| "grad_norm": 28.418310165405273, |
| "learning_rate": 9.064102564102564e-06, |
| "loss": 6.9382, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.8803418803418803, |
| "grad_norm": 13.454174995422363, |
| "learning_rate": 9.059829059829061e-06, |
| "loss": 4.5504, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.8888888888888888, |
| "grad_norm": 20.746938705444336, |
| "learning_rate": 9.055555555555556e-06, |
| "loss": 6.4711, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.8974358974358974, |
| "grad_norm": 23.29437828063965, |
| "learning_rate": 9.051282051282051e-06, |
| "loss": 6.1381, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.9059829059829059, |
| "grad_norm": 31.720672607421875, |
| "learning_rate": 9.047008547008548e-06, |
| "loss": 6.7716, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.9145299145299144, |
| "grad_norm": 16.971572875976562, |
| "learning_rate": 9.042735042735043e-06, |
| "loss": 6.4734, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.9230769230769231, |
| "grad_norm": 25.185396194458008, |
| "learning_rate": 9.03846153846154e-06, |
| "loss": 6.2505, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.9316239316239316, |
| "grad_norm": 42.373863220214844, |
| "learning_rate": 9.034188034188035e-06, |
| "loss": 7.1968, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.9401709401709402, |
| "grad_norm": 21.06004524230957, |
| "learning_rate": 9.029914529914532e-06, |
| "loss": 6.082, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.9487179487179487, |
| "grad_norm": 21.413599014282227, |
| "learning_rate": 9.025641025641027e-06, |
| "loss": 6.2279, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.9572649572649574, |
| "grad_norm": 18.379974365234375, |
| "learning_rate": 9.021367521367522e-06, |
| "loss": 6.6032, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.965811965811966, |
| "grad_norm": 28.239042282104492, |
| "learning_rate": 9.017094017094018e-06, |
| "loss": 6.5428, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.9743589743589745, |
| "grad_norm": 17.92879867553711, |
| "learning_rate": 9.012820512820513e-06, |
| "loss": 5.986, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.982905982905983, |
| "grad_norm": 15.501392364501953, |
| "learning_rate": 9.00854700854701e-06, |
| "loss": 5.9526, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.9914529914529915, |
| "grad_norm": 23.742633819580078, |
| "learning_rate": 9.004273504273505e-06, |
| "loss": 6.2462, |
| "step": 233 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 28.22560691833496, |
| "learning_rate": 9e-06, |
| "loss": 5.8705, |
| "step": 234 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 5.379393577575684, |
| "eval_runtime": 9.2791, |
| "eval_samples_per_second": 50.22, |
| "eval_steps_per_second": 6.358, |
| "step": 234 |
| }, |
| { |
| "epoch": 2.0085470085470085, |
| "grad_norm": 21.7072696685791, |
| "learning_rate": 8.995726495726497e-06, |
| "loss": 6.2757, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.017094017094017, |
| "grad_norm": 20.955190658569336, |
| "learning_rate": 8.991452991452992e-06, |
| "loss": 5.7265, |
| "step": 236 |
| }, |
| { |
| "epoch": 2.0256410256410255, |
| "grad_norm": 15.186567306518555, |
| "learning_rate": 8.987179487179489e-06, |
| "loss": 6.1958, |
| "step": 237 |
| }, |
| { |
| "epoch": 2.034188034188034, |
| "grad_norm": 20.938766479492188, |
| "learning_rate": 8.982905982905984e-06, |
| "loss": 6.2317, |
| "step": 238 |
| }, |
| { |
| "epoch": 2.0427350427350426, |
| "grad_norm": 18.457494735717773, |
| "learning_rate": 8.978632478632479e-06, |
| "loss": 6.4711, |
| "step": 239 |
| }, |
| { |
| "epoch": 2.051282051282051, |
| "grad_norm": 43.505149841308594, |
| "learning_rate": 8.974358974358976e-06, |
| "loss": 5.9632, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.0598290598290596, |
| "grad_norm": 15.558544158935547, |
| "learning_rate": 8.97008547008547e-06, |
| "loss": 5.8099, |
| "step": 241 |
| }, |
| { |
| "epoch": 2.0683760683760686, |
| "grad_norm": 22.20660400390625, |
| "learning_rate": 8.965811965811967e-06, |
| "loss": 5.7939, |
| "step": 242 |
| }, |
| { |
| "epoch": 2.076923076923077, |
| "grad_norm": 15.866617202758789, |
| "learning_rate": 8.961538461538462e-06, |
| "loss": 5.9473, |
| "step": 243 |
| }, |
| { |
| "epoch": 2.0854700854700856, |
| "grad_norm": 20.30729103088379, |
| "learning_rate": 8.957264957264959e-06, |
| "loss": 6.2028, |
| "step": 244 |
| }, |
| { |
| "epoch": 2.094017094017094, |
| "grad_norm": 15.517614364624023, |
| "learning_rate": 8.952991452991454e-06, |
| "loss": 5.906, |
| "step": 245 |
| }, |
| { |
| "epoch": 2.1025641025641026, |
| "grad_norm": 21.30764389038086, |
| "learning_rate": 8.94871794871795e-06, |
| "loss": 6.1907, |
| "step": 246 |
| }, |
| { |
| "epoch": 2.111111111111111, |
| "grad_norm": 19.973115921020508, |
| "learning_rate": 8.944444444444446e-06, |
| "loss": 5.6895, |
| "step": 247 |
| }, |
| { |
| "epoch": 2.1196581196581197, |
| "grad_norm": 17.40595817565918, |
| "learning_rate": 8.940170940170941e-06, |
| "loss": 5.4836, |
| "step": 248 |
| }, |
| { |
| "epoch": 2.128205128205128, |
| "grad_norm": 27.667421340942383, |
| "learning_rate": 8.935897435897438e-06, |
| "loss": 5.9082, |
| "step": 249 |
| }, |
| { |
| "epoch": 2.1367521367521367, |
| "grad_norm": 18.151315689086914, |
| "learning_rate": 8.931623931623933e-06, |
| "loss": 5.8102, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.1452991452991452, |
| "grad_norm": 15.390297889709473, |
| "learning_rate": 8.927350427350428e-06, |
| "loss": 5.5504, |
| "step": 251 |
| }, |
| { |
| "epoch": 2.1538461538461537, |
| "grad_norm": 17.257841110229492, |
| "learning_rate": 8.923076923076925e-06, |
| "loss": 5.9043, |
| "step": 252 |
| }, |
| { |
| "epoch": 2.1623931623931623, |
| "grad_norm": 19.2503604888916, |
| "learning_rate": 8.91880341880342e-06, |
| "loss": 5.8349, |
| "step": 253 |
| }, |
| { |
| "epoch": 2.1709401709401708, |
| "grad_norm": 25.236759185791016, |
| "learning_rate": 8.914529914529916e-06, |
| "loss": 5.2908, |
| "step": 254 |
| }, |
| { |
| "epoch": 2.1794871794871793, |
| "grad_norm": 13.771193504333496, |
| "learning_rate": 8.910256410256411e-06, |
| "loss": 5.4743, |
| "step": 255 |
| }, |
| { |
| "epoch": 2.1880341880341883, |
| "grad_norm": 17.406471252441406, |
| "learning_rate": 8.905982905982906e-06, |
| "loss": 5.6856, |
| "step": 256 |
| }, |
| { |
| "epoch": 2.1965811965811968, |
| "grad_norm": 14.727091789245605, |
| "learning_rate": 8.901709401709401e-06, |
| "loss": 5.7937, |
| "step": 257 |
| }, |
| { |
| "epoch": 2.2051282051282053, |
| "grad_norm": 18.193246841430664, |
| "learning_rate": 8.897435897435898e-06, |
| "loss": 5.5704, |
| "step": 258 |
| }, |
| { |
| "epoch": 2.213675213675214, |
| "grad_norm": 21.573726654052734, |
| "learning_rate": 8.893162393162393e-06, |
| "loss": 5.479, |
| "step": 259 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 28.72640037536621, |
| "learning_rate": 8.888888888888888e-06, |
| "loss": 5.5096, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.230769230769231, |
| "grad_norm": 15.4992094039917, |
| "learning_rate": 8.884615384615385e-06, |
| "loss": 5.217, |
| "step": 261 |
| }, |
| { |
| "epoch": 2.2393162393162394, |
| "grad_norm": 17.753416061401367, |
| "learning_rate": 8.88034188034188e-06, |
| "loss": 5.8173, |
| "step": 262 |
| }, |
| { |
| "epoch": 2.247863247863248, |
| "grad_norm": 15.91961669921875, |
| "learning_rate": 8.876068376068377e-06, |
| "loss": 5.7171, |
| "step": 263 |
| }, |
| { |
| "epoch": 2.2564102564102564, |
| "grad_norm": 23.30504035949707, |
| "learning_rate": 8.871794871794872e-06, |
| "loss": 5.6214, |
| "step": 264 |
| }, |
| { |
| "epoch": 2.264957264957265, |
| "grad_norm": 15.583686828613281, |
| "learning_rate": 8.867521367521369e-06, |
| "loss": 5.2343, |
| "step": 265 |
| }, |
| { |
| "epoch": 2.2735042735042734, |
| "grad_norm": 24.482046127319336, |
| "learning_rate": 8.863247863247864e-06, |
| "loss": 5.0747, |
| "step": 266 |
| }, |
| { |
| "epoch": 2.282051282051282, |
| "grad_norm": 16.17924690246582, |
| "learning_rate": 8.858974358974359e-06, |
| "loss": 5.2645, |
| "step": 267 |
| }, |
| { |
| "epoch": 2.2905982905982905, |
| "grad_norm": 19.538314819335938, |
| "learning_rate": 8.854700854700855e-06, |
| "loss": 5.3484, |
| "step": 268 |
| }, |
| { |
| "epoch": 2.299145299145299, |
| "grad_norm": 14.472186088562012, |
| "learning_rate": 8.85042735042735e-06, |
| "loss": 5.8159, |
| "step": 269 |
| }, |
| { |
| "epoch": 2.3076923076923075, |
| "grad_norm": 16.797805786132812, |
| "learning_rate": 8.846153846153847e-06, |
| "loss": 5.4466, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.316239316239316, |
| "grad_norm": 13.237580299377441, |
| "learning_rate": 8.841880341880342e-06, |
| "loss": 5.2189, |
| "step": 271 |
| }, |
| { |
| "epoch": 2.324786324786325, |
| "grad_norm": 16.685317993164062, |
| "learning_rate": 8.837606837606837e-06, |
| "loss": 5.7098, |
| "step": 272 |
| }, |
| { |
| "epoch": 2.3333333333333335, |
| "grad_norm": 16.63880729675293, |
| "learning_rate": 8.833333333333334e-06, |
| "loss": 5.0714, |
| "step": 273 |
| }, |
| { |
| "epoch": 2.341880341880342, |
| "grad_norm": 20.871978759765625, |
| "learning_rate": 8.829059829059829e-06, |
| "loss": 4.9509, |
| "step": 274 |
| }, |
| { |
| "epoch": 2.3504273504273505, |
| "grad_norm": 16.95268440246582, |
| "learning_rate": 8.824786324786326e-06, |
| "loss": 5.4166, |
| "step": 275 |
| }, |
| { |
| "epoch": 2.358974358974359, |
| "grad_norm": 15.446279525756836, |
| "learning_rate": 8.820512820512821e-06, |
| "loss": 4.5967, |
| "step": 276 |
| }, |
| { |
| "epoch": 2.3675213675213675, |
| "grad_norm": 17.148235321044922, |
| "learning_rate": 8.816239316239316e-06, |
| "loss": 5.2542, |
| "step": 277 |
| }, |
| { |
| "epoch": 2.376068376068376, |
| "grad_norm": 17.014827728271484, |
| "learning_rate": 8.811965811965813e-06, |
| "loss": 5.4702, |
| "step": 278 |
| }, |
| { |
| "epoch": 2.3846153846153846, |
| "grad_norm": 15.313383102416992, |
| "learning_rate": 8.807692307692308e-06, |
| "loss": 5.2119, |
| "step": 279 |
| }, |
| { |
| "epoch": 2.393162393162393, |
| "grad_norm": 20.2298641204834, |
| "learning_rate": 8.803418803418804e-06, |
| "loss": 5.4064, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.4017094017094016, |
| "grad_norm": 14.982254981994629, |
| "learning_rate": 8.7991452991453e-06, |
| "loss": 5.2545, |
| "step": 281 |
| }, |
| { |
| "epoch": 2.41025641025641, |
| "grad_norm": 16.258047103881836, |
| "learning_rate": 8.794871794871796e-06, |
| "loss": 5.0141, |
| "step": 282 |
| }, |
| { |
| "epoch": 2.4188034188034186, |
| "grad_norm": 22.5199031829834, |
| "learning_rate": 8.790598290598291e-06, |
| "loss": 5.3486, |
| "step": 283 |
| }, |
| { |
| "epoch": 2.427350427350427, |
| "grad_norm": 17.546480178833008, |
| "learning_rate": 8.786324786324786e-06, |
| "loss": 5.2785, |
| "step": 284 |
| }, |
| { |
| "epoch": 2.435897435897436, |
| "grad_norm": 22.07866668701172, |
| "learning_rate": 8.782051282051283e-06, |
| "loss": 5.4471, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.4444444444444446, |
| "grad_norm": 409.2532043457031, |
| "learning_rate": 8.777777777777778e-06, |
| "loss": 6.0948, |
| "step": 286 |
| }, |
| { |
| "epoch": 2.452991452991453, |
| "grad_norm": 185.7334747314453, |
| "learning_rate": 8.773504273504275e-06, |
| "loss": 5.5538, |
| "step": 287 |
| }, |
| { |
| "epoch": 2.4615384615384617, |
| "grad_norm": 30.8182430267334, |
| "learning_rate": 8.76923076923077e-06, |
| "loss": 4.9661, |
| "step": 288 |
| }, |
| { |
| "epoch": 2.47008547008547, |
| "grad_norm": 18.584409713745117, |
| "learning_rate": 8.764957264957265e-06, |
| "loss": 5.0947, |
| "step": 289 |
| }, |
| { |
| "epoch": 2.4786324786324787, |
| "grad_norm": 18.128522872924805, |
| "learning_rate": 8.760683760683762e-06, |
| "loss": 4.8816, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.4871794871794872, |
| "grad_norm": 18.800090789794922, |
| "learning_rate": 8.756410256410257e-06, |
| "loss": 5.0952, |
| "step": 291 |
| }, |
| { |
| "epoch": 2.4957264957264957, |
| "grad_norm": 22.140430450439453, |
| "learning_rate": 8.752136752136753e-06, |
| "loss": 4.5408, |
| "step": 292 |
| }, |
| { |
| "epoch": 2.5042735042735043, |
| "grad_norm": 19.867111206054688, |
| "learning_rate": 8.747863247863248e-06, |
| "loss": 4.7435, |
| "step": 293 |
| }, |
| { |
| "epoch": 2.5128205128205128, |
| "grad_norm": 19.437868118286133, |
| "learning_rate": 8.743589743589743e-06, |
| "loss": 5.2643, |
| "step": 294 |
| }, |
| { |
| "epoch": 2.5213675213675213, |
| "grad_norm": 18.256561279296875, |
| "learning_rate": 8.73931623931624e-06, |
| "loss": 5.2531, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.52991452991453, |
| "grad_norm": 18.65209197998047, |
| "learning_rate": 8.735042735042735e-06, |
| "loss": 4.8646, |
| "step": 296 |
| }, |
| { |
| "epoch": 2.5384615384615383, |
| "grad_norm": 14.704927444458008, |
| "learning_rate": 8.730769230769232e-06, |
| "loss": 4.8343, |
| "step": 297 |
| }, |
| { |
| "epoch": 2.547008547008547, |
| "grad_norm": 15.522851943969727, |
| "learning_rate": 8.726495726495727e-06, |
| "loss": 4.898, |
| "step": 298 |
| }, |
| { |
| "epoch": 2.5555555555555554, |
| "grad_norm": 21.7825927734375, |
| "learning_rate": 8.722222222222224e-06, |
| "loss": 5.0732, |
| "step": 299 |
| }, |
| { |
| "epoch": 2.564102564102564, |
| "grad_norm": 17.963552474975586, |
| "learning_rate": 8.717948717948719e-06, |
| "loss": 4.9684, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.5726495726495724, |
| "grad_norm": 16.14459991455078, |
| "learning_rate": 8.713675213675214e-06, |
| "loss": 4.8802, |
| "step": 301 |
| }, |
| { |
| "epoch": 2.5811965811965814, |
| "grad_norm": 18.386646270751953, |
| "learning_rate": 8.70940170940171e-06, |
| "loss": 4.8837, |
| "step": 302 |
| }, |
| { |
| "epoch": 2.58974358974359, |
| "grad_norm": 19.471376419067383, |
| "learning_rate": 8.705128205128206e-06, |
| "loss": 4.6325, |
| "step": 303 |
| }, |
| { |
| "epoch": 2.5982905982905984, |
| "grad_norm": 17.839717864990234, |
| "learning_rate": 8.700854700854702e-06, |
| "loss": 4.7851, |
| "step": 304 |
| }, |
| { |
| "epoch": 2.606837606837607, |
| "grad_norm": 26.519363403320312, |
| "learning_rate": 8.696581196581197e-06, |
| "loss": 5.0576, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.6153846153846154, |
| "grad_norm": 14.135244369506836, |
| "learning_rate": 8.692307692307692e-06, |
| "loss": 4.7719, |
| "step": 306 |
| }, |
| { |
| "epoch": 2.623931623931624, |
| "grad_norm": 16.5241641998291, |
| "learning_rate": 8.68803418803419e-06, |
| "loss": 4.5826, |
| "step": 307 |
| }, |
| { |
| "epoch": 2.6324786324786325, |
| "grad_norm": 23.982437133789062, |
| "learning_rate": 8.683760683760684e-06, |
| "loss": 4.4878, |
| "step": 308 |
| }, |
| { |
| "epoch": 2.641025641025641, |
| "grad_norm": 16.036361694335938, |
| "learning_rate": 8.679487179487181e-06, |
| "loss": 4.3867, |
| "step": 309 |
| }, |
| { |
| "epoch": 2.6495726495726495, |
| "grad_norm": 16.19298553466797, |
| "learning_rate": 8.675213675213676e-06, |
| "loss": 4.763, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.658119658119658, |
| "grad_norm": 19.32802963256836, |
| "learning_rate": 8.670940170940171e-06, |
| "loss": 4.4083, |
| "step": 311 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "grad_norm": 21.75898551940918, |
| "learning_rate": 8.666666666666668e-06, |
| "loss": 4.8782, |
| "step": 312 |
| }, |
| { |
| "epoch": 2.6752136752136755, |
| "grad_norm": 905.6954956054688, |
| "learning_rate": 8.662393162393163e-06, |
| "loss": 5.7901, |
| "step": 313 |
| }, |
| { |
| "epoch": 2.683760683760684, |
| "grad_norm": 21.126985549926758, |
| "learning_rate": 8.65811965811966e-06, |
| "loss": 4.918, |
| "step": 314 |
| }, |
| { |
| "epoch": 2.6923076923076925, |
| "grad_norm": 22.190237045288086, |
| "learning_rate": 8.653846153846155e-06, |
| "loss": 4.4327, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.700854700854701, |
| "grad_norm": 90.69184875488281, |
| "learning_rate": 8.649572649572651e-06, |
| "loss": 5.1477, |
| "step": 316 |
| }, |
| { |
| "epoch": 2.7094017094017095, |
| "grad_norm": 43.43864059448242, |
| "learning_rate": 8.645299145299146e-06, |
| "loss": 4.5476, |
| "step": 317 |
| }, |
| { |
| "epoch": 2.717948717948718, |
| "grad_norm": 19.24538230895996, |
| "learning_rate": 8.641025641025641e-06, |
| "loss": 4.4304, |
| "step": 318 |
| }, |
| { |
| "epoch": 2.7264957264957266, |
| "grad_norm": 21.809600830078125, |
| "learning_rate": 8.636752136752138e-06, |
| "loss": 4.4215, |
| "step": 319 |
| }, |
| { |
| "epoch": 2.735042735042735, |
| "grad_norm": 21.406156539916992, |
| "learning_rate": 8.632478632478633e-06, |
| "loss": 4.5411, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.7435897435897436, |
| "grad_norm": 17.57236099243164, |
| "learning_rate": 8.62820512820513e-06, |
| "loss": 4.7952, |
| "step": 321 |
| }, |
| { |
| "epoch": 2.752136752136752, |
| "grad_norm": 21.049169540405273, |
| "learning_rate": 8.623931623931625e-06, |
| "loss": 4.4596, |
| "step": 322 |
| }, |
| { |
| "epoch": 2.7606837606837606, |
| "grad_norm": 20.04981803894043, |
| "learning_rate": 8.61965811965812e-06, |
| "loss": 4.4705, |
| "step": 323 |
| }, |
| { |
| "epoch": 2.769230769230769, |
| "grad_norm": 21.146499633789062, |
| "learning_rate": 8.615384615384617e-06, |
| "loss": 4.6081, |
| "step": 324 |
| }, |
| { |
| "epoch": 2.7777777777777777, |
| "grad_norm": 20.9805908203125, |
| "learning_rate": 8.611111111111112e-06, |
| "loss": 4.8387, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.786324786324786, |
| "grad_norm": 17.708343505859375, |
| "learning_rate": 8.606837606837609e-06, |
| "loss": 4.3455, |
| "step": 326 |
| }, |
| { |
| "epoch": 2.7948717948717947, |
| "grad_norm": 25.657032012939453, |
| "learning_rate": 8.602564102564104e-06, |
| "loss": 4.3119, |
| "step": 327 |
| }, |
| { |
| "epoch": 2.8034188034188032, |
| "grad_norm": 17.713972091674805, |
| "learning_rate": 8.598290598290599e-06, |
| "loss": 4.5597, |
| "step": 328 |
| }, |
| { |
| "epoch": 2.8119658119658117, |
| "grad_norm": 22.297082901000977, |
| "learning_rate": 8.594017094017095e-06, |
| "loss": 3.8398, |
| "step": 329 |
| }, |
| { |
| "epoch": 2.8205128205128203, |
| "grad_norm": 16.11454200744629, |
| "learning_rate": 8.58974358974359e-06, |
| "loss": 3.2049, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.8290598290598292, |
| "grad_norm": 27.323585510253906, |
| "learning_rate": 8.585470085470086e-06, |
| "loss": 4.0371, |
| "step": 331 |
| }, |
| { |
| "epoch": 2.8376068376068377, |
| "grad_norm": 21.090797424316406, |
| "learning_rate": 8.58119658119658e-06, |
| "loss": 4.5193, |
| "step": 332 |
| }, |
| { |
| "epoch": 2.8461538461538463, |
| "grad_norm": 39.087432861328125, |
| "learning_rate": 8.576923076923077e-06, |
| "loss": 4.3537, |
| "step": 333 |
| }, |
| { |
| "epoch": 2.8547008547008548, |
| "grad_norm": 18.49846839904785, |
| "learning_rate": 8.572649572649572e-06, |
| "loss": 4.614, |
| "step": 334 |
| }, |
| { |
| "epoch": 2.8632478632478633, |
| "grad_norm": 26.671632766723633, |
| "learning_rate": 8.568376068376069e-06, |
| "loss": 4.2224, |
| "step": 335 |
| }, |
| { |
| "epoch": 2.871794871794872, |
| "grad_norm": 25.799545288085938, |
| "learning_rate": 8.564102564102564e-06, |
| "loss": 4.2209, |
| "step": 336 |
| }, |
| { |
| "epoch": 2.8803418803418803, |
| "grad_norm": 20.131961822509766, |
| "learning_rate": 8.559829059829061e-06, |
| "loss": 4.5194, |
| "step": 337 |
| }, |
| { |
| "epoch": 2.888888888888889, |
| "grad_norm": 20.193859100341797, |
| "learning_rate": 8.555555555555556e-06, |
| "loss": 3.9966, |
| "step": 338 |
| }, |
| { |
| "epoch": 2.8974358974358974, |
| "grad_norm": 20.06737518310547, |
| "learning_rate": 8.551282051282051e-06, |
| "loss": 3.7394, |
| "step": 339 |
| }, |
| { |
| "epoch": 2.905982905982906, |
| "grad_norm": 438.34429931640625, |
| "learning_rate": 8.547008547008548e-06, |
| "loss": 5.1558, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.9145299145299144, |
| "grad_norm": 22.152528762817383, |
| "learning_rate": 8.542735042735043e-06, |
| "loss": 3.9014, |
| "step": 341 |
| }, |
| { |
| "epoch": 2.9230769230769234, |
| "grad_norm": 29.279739379882812, |
| "learning_rate": 8.53846153846154e-06, |
| "loss": 4.0479, |
| "step": 342 |
| }, |
| { |
| "epoch": 2.931623931623932, |
| "grad_norm": 26.182645797729492, |
| "learning_rate": 8.534188034188035e-06, |
| "loss": 4.2022, |
| "step": 343 |
| }, |
| { |
| "epoch": 2.9401709401709404, |
| "grad_norm": 22.329736709594727, |
| "learning_rate": 8.52991452991453e-06, |
| "loss": 3.8777, |
| "step": 344 |
| }, |
| { |
| "epoch": 2.948717948717949, |
| "grad_norm": 20.62833023071289, |
| "learning_rate": 8.525641025641026e-06, |
| "loss": 4.2189, |
| "step": 345 |
| }, |
| { |
| "epoch": 2.9572649572649574, |
| "grad_norm": 20.176612854003906, |
| "learning_rate": 8.521367521367521e-06, |
| "loss": 4.0124, |
| "step": 346 |
| }, |
| { |
| "epoch": 2.965811965811966, |
| "grad_norm": 18.77017593383789, |
| "learning_rate": 8.517094017094018e-06, |
| "loss": 3.3286, |
| "step": 347 |
| }, |
| { |
| "epoch": 2.9743589743589745, |
| "grad_norm": 226.93701171875, |
| "learning_rate": 8.512820512820513e-06, |
| "loss": 4.6969, |
| "step": 348 |
| }, |
| { |
| "epoch": 2.982905982905983, |
| "grad_norm": 675.1133422851562, |
| "learning_rate": 8.508547008547008e-06, |
| "loss": 4.6717, |
| "step": 349 |
| }, |
| { |
| "epoch": 2.9914529914529915, |
| "grad_norm": 19.938486099243164, |
| "learning_rate": 8.504273504273505e-06, |
| "loss": 4.0103, |
| "step": 350 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 15.917003631591797, |
| "learning_rate": 8.5e-06, |
| "loss": 3.1643, |
| "step": 351 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 3.4197537899017334, |
| "eval_runtime": 9.289, |
| "eval_samples_per_second": 50.167, |
| "eval_steps_per_second": 6.352, |
| "step": 351 |
| }, |
| { |
| "epoch": 3.0085470085470085, |
| "grad_norm": 22.22833251953125, |
| "learning_rate": 8.495726495726497e-06, |
| "loss": 4.3458, |
| "step": 352 |
| }, |
| { |
| "epoch": 3.017094017094017, |
| "grad_norm": 16.4627685546875, |
| "learning_rate": 8.491452991452992e-06, |
| "loss": 3.5374, |
| "step": 353 |
| }, |
| { |
| "epoch": 3.0256410256410255, |
| "grad_norm": 16.389379501342773, |
| "learning_rate": 8.487179487179488e-06, |
| "loss": 4.1384, |
| "step": 354 |
| }, |
| { |
| "epoch": 3.034188034188034, |
| "grad_norm": 19.589706420898438, |
| "learning_rate": 8.482905982905983e-06, |
| "loss": 3.9522, |
| "step": 355 |
| }, |
| { |
| "epoch": 3.0427350427350426, |
| "grad_norm": 21.66250228881836, |
| "learning_rate": 8.478632478632479e-06, |
| "loss": 4.0197, |
| "step": 356 |
| }, |
| { |
| "epoch": 3.051282051282051, |
| "grad_norm": 42.1422119140625, |
| "learning_rate": 8.474358974358975e-06, |
| "loss": 3.9432, |
| "step": 357 |
| }, |
| { |
| "epoch": 3.0598290598290596, |
| "grad_norm": 23.0153751373291, |
| "learning_rate": 8.47008547008547e-06, |
| "loss": 3.9146, |
| "step": 358 |
| }, |
| { |
| "epoch": 3.0683760683760686, |
| "grad_norm": 20.847400665283203, |
| "learning_rate": 8.465811965811967e-06, |
| "loss": 3.9736, |
| "step": 359 |
| }, |
| { |
| "epoch": 3.076923076923077, |
| "grad_norm": 23.553855895996094, |
| "learning_rate": 8.461538461538462e-06, |
| "loss": 3.646, |
| "step": 360 |
| }, |
| { |
| "epoch": 3.0854700854700856, |
| "grad_norm": 18.651151657104492, |
| "learning_rate": 8.457264957264957e-06, |
| "loss": 3.761, |
| "step": 361 |
| }, |
| { |
| "epoch": 3.094017094017094, |
| "grad_norm": 23.437379837036133, |
| "learning_rate": 8.452991452991454e-06, |
| "loss": 3.9258, |
| "step": 362 |
| }, |
| { |
| "epoch": 3.1025641025641026, |
| "grad_norm": 19.025928497314453, |
| "learning_rate": 8.448717948717949e-06, |
| "loss": 3.4911, |
| "step": 363 |
| }, |
| { |
| "epoch": 3.111111111111111, |
| "grad_norm": 25.955963134765625, |
| "learning_rate": 8.444444444444446e-06, |
| "loss": 3.7231, |
| "step": 364 |
| }, |
| { |
| "epoch": 3.1196581196581197, |
| "grad_norm": 19.691673278808594, |
| "learning_rate": 8.44017094017094e-06, |
| "loss": 3.9225, |
| "step": 365 |
| }, |
| { |
| "epoch": 3.128205128205128, |
| "grad_norm": 19.47168731689453, |
| "learning_rate": 8.435897435897436e-06, |
| "loss": 3.6261, |
| "step": 366 |
| }, |
| { |
| "epoch": 3.1367521367521367, |
| "grad_norm": 20.50010108947754, |
| "learning_rate": 8.431623931623932e-06, |
| "loss": 3.3306, |
| "step": 367 |
| }, |
| { |
| "epoch": 3.1452991452991452, |
| "grad_norm": 21.198938369750977, |
| "learning_rate": 8.427350427350428e-06, |
| "loss": 3.6388, |
| "step": 368 |
| }, |
| { |
| "epoch": 3.1538461538461537, |
| "grad_norm": 16.93203353881836, |
| "learning_rate": 8.423076923076924e-06, |
| "loss": 3.9556, |
| "step": 369 |
| }, |
| { |
| "epoch": 3.1623931623931623, |
| "grad_norm": 15.074128150939941, |
| "learning_rate": 8.41880341880342e-06, |
| "loss": 2.9899, |
| "step": 370 |
| }, |
| { |
| "epoch": 3.1709401709401708, |
| "grad_norm": 23.041452407836914, |
| "learning_rate": 8.414529914529916e-06, |
| "loss": 3.291, |
| "step": 371 |
| }, |
| { |
| "epoch": 3.1794871794871793, |
| "grad_norm": 24.146419525146484, |
| "learning_rate": 8.410256410256411e-06, |
| "loss": 4.0683, |
| "step": 372 |
| }, |
| { |
| "epoch": 3.1880341880341883, |
| "grad_norm": 27.864879608154297, |
| "learning_rate": 8.405982905982906e-06, |
| "loss": 3.6171, |
| "step": 373 |
| }, |
| { |
| "epoch": 3.1965811965811968, |
| "grad_norm": 33.83136749267578, |
| "learning_rate": 8.401709401709403e-06, |
| "loss": 3.7324, |
| "step": 374 |
| }, |
| { |
| "epoch": 3.2051282051282053, |
| "grad_norm": 21.020702362060547, |
| "learning_rate": 8.397435897435898e-06, |
| "loss": 3.5688, |
| "step": 375 |
| }, |
| { |
| "epoch": 3.213675213675214, |
| "grad_norm": 23.521453857421875, |
| "learning_rate": 8.393162393162395e-06, |
| "loss": 3.6917, |
| "step": 376 |
| }, |
| { |
| "epoch": 3.2222222222222223, |
| "grad_norm": 35.85578536987305, |
| "learning_rate": 8.38888888888889e-06, |
| "loss": 3.6532, |
| "step": 377 |
| }, |
| { |
| "epoch": 3.230769230769231, |
| "grad_norm": 26.080968856811523, |
| "learning_rate": 8.384615384615385e-06, |
| "loss": 3.8828, |
| "step": 378 |
| }, |
| { |
| "epoch": 3.2393162393162394, |
| "grad_norm": 20.829381942749023, |
| "learning_rate": 8.380341880341881e-06, |
| "loss": 3.8374, |
| "step": 379 |
| }, |
| { |
| "epoch": 3.247863247863248, |
| "grad_norm": 20.85077476501465, |
| "learning_rate": 8.376068376068377e-06, |
| "loss": 3.2896, |
| "step": 380 |
| }, |
| { |
| "epoch": 3.2564102564102564, |
| "grad_norm": 19.036088943481445, |
| "learning_rate": 8.371794871794873e-06, |
| "loss": 3.4996, |
| "step": 381 |
| }, |
| { |
| "epoch": 3.264957264957265, |
| "grad_norm": 23.725513458251953, |
| "learning_rate": 8.367521367521368e-06, |
| "loss": 3.7686, |
| "step": 382 |
| }, |
| { |
| "epoch": 3.2735042735042734, |
| "grad_norm": 22.553386688232422, |
| "learning_rate": 8.363247863247865e-06, |
| "loss": 3.8476, |
| "step": 383 |
| }, |
| { |
| "epoch": 3.282051282051282, |
| "grad_norm": 20.263992309570312, |
| "learning_rate": 8.35897435897436e-06, |
| "loss": 3.3278, |
| "step": 384 |
| }, |
| { |
| "epoch": 3.2905982905982905, |
| "grad_norm": 22.47858238220215, |
| "learning_rate": 8.354700854700855e-06, |
| "loss": 3.5437, |
| "step": 385 |
| }, |
| { |
| "epoch": 3.299145299145299, |
| "grad_norm": 24.14532470703125, |
| "learning_rate": 8.350427350427352e-06, |
| "loss": 3.696, |
| "step": 386 |
| }, |
| { |
| "epoch": 3.3076923076923075, |
| "grad_norm": 31.457847595214844, |
| "learning_rate": 8.346153846153847e-06, |
| "loss": 4.3065, |
| "step": 387 |
| }, |
| { |
| "epoch": 3.316239316239316, |
| "grad_norm": 24.503095626831055, |
| "learning_rate": 8.341880341880344e-06, |
| "loss": 3.4798, |
| "step": 388 |
| }, |
| { |
| "epoch": 3.324786324786325, |
| "grad_norm": 19.798818588256836, |
| "learning_rate": 8.337606837606839e-06, |
| "loss": 3.5323, |
| "step": 389 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 22.023189544677734, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 3.4088, |
| "step": 390 |
| }, |
| { |
| "epoch": 3.341880341880342, |
| "grad_norm": 17.314960479736328, |
| "learning_rate": 8.32905982905983e-06, |
| "loss": 3.2462, |
| "step": 391 |
| }, |
| { |
| "epoch": 3.3504273504273505, |
| "grad_norm": 22.714536666870117, |
| "learning_rate": 8.324786324786326e-06, |
| "loss": 3.7863, |
| "step": 392 |
| }, |
| { |
| "epoch": 3.358974358974359, |
| "grad_norm": 27.710514068603516, |
| "learning_rate": 8.320512820512822e-06, |
| "loss": 3.6032, |
| "step": 393 |
| }, |
| { |
| "epoch": 3.3675213675213675, |
| "grad_norm": 23.35419464111328, |
| "learning_rate": 8.316239316239317e-06, |
| "loss": 3.5599, |
| "step": 394 |
| }, |
| { |
| "epoch": 3.376068376068376, |
| "grad_norm": 24.0956974029541, |
| "learning_rate": 8.311965811965812e-06, |
| "loss": 3.5186, |
| "step": 395 |
| }, |
| { |
| "epoch": 3.3846153846153846, |
| "grad_norm": 22.09107780456543, |
| "learning_rate": 8.307692307692309e-06, |
| "loss": 3.4843, |
| "step": 396 |
| }, |
| { |
| "epoch": 3.393162393162393, |
| "grad_norm": 23.956623077392578, |
| "learning_rate": 8.303418803418804e-06, |
| "loss": 3.1625, |
| "step": 397 |
| }, |
| { |
| "epoch": 3.4017094017094016, |
| "grad_norm": 18.875917434692383, |
| "learning_rate": 8.299145299145301e-06, |
| "loss": 3.3494, |
| "step": 398 |
| }, |
| { |
| "epoch": 3.41025641025641, |
| "grad_norm": 33.475467681884766, |
| "learning_rate": 8.294871794871796e-06, |
| "loss": 3.9247, |
| "step": 399 |
| }, |
| { |
| "epoch": 3.4188034188034186, |
| "grad_norm": 16.28295135498047, |
| "learning_rate": 8.290598290598293e-06, |
| "loss": 3.7446, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.427350427350427, |
| "grad_norm": 24.205049514770508, |
| "learning_rate": 8.286324786324788e-06, |
| "loss": 3.343, |
| "step": 401 |
| }, |
| { |
| "epoch": 3.435897435897436, |
| "grad_norm": 21.21460723876953, |
| "learning_rate": 8.282051282051283e-06, |
| "loss": 3.2437, |
| "step": 402 |
| }, |
| { |
| "epoch": 3.4444444444444446, |
| "grad_norm": 36.8713264465332, |
| "learning_rate": 8.277777777777778e-06, |
| "loss": 3.5009, |
| "step": 403 |
| }, |
| { |
| "epoch": 3.452991452991453, |
| "grad_norm": 26.85513687133789, |
| "learning_rate": 8.273504273504273e-06, |
| "loss": 3.7271, |
| "step": 404 |
| }, |
| { |
| "epoch": 3.4615384615384617, |
| "grad_norm": 18.184600830078125, |
| "learning_rate": 8.26923076923077e-06, |
| "loss": 3.2216, |
| "step": 405 |
| }, |
| { |
| "epoch": 3.47008547008547, |
| "grad_norm": 27.03692054748535, |
| "learning_rate": 8.264957264957265e-06, |
| "loss": 3.516, |
| "step": 406 |
| }, |
| { |
| "epoch": 3.4786324786324787, |
| "grad_norm": 20.63736915588379, |
| "learning_rate": 8.260683760683761e-06, |
| "loss": 3.1349, |
| "step": 407 |
| }, |
| { |
| "epoch": 3.4871794871794872, |
| "grad_norm": 22.467845916748047, |
| "learning_rate": 8.256410256410256e-06, |
| "loss": 3.3878, |
| "step": 408 |
| }, |
| { |
| "epoch": 3.4957264957264957, |
| "grad_norm": 21.25887107849121, |
| "learning_rate": 8.252136752136753e-06, |
| "loss": 3.8298, |
| "step": 409 |
| }, |
| { |
| "epoch": 3.5042735042735043, |
| "grad_norm": 47.3256721496582, |
| "learning_rate": 8.247863247863248e-06, |
| "loss": 3.5321, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.5128205128205128, |
| "grad_norm": 22.103790283203125, |
| "learning_rate": 8.243589743589743e-06, |
| "loss": 3.335, |
| "step": 411 |
| }, |
| { |
| "epoch": 3.5213675213675213, |
| "grad_norm": 25.779077529907227, |
| "learning_rate": 8.23931623931624e-06, |
| "loss": 3.5047, |
| "step": 412 |
| }, |
| { |
| "epoch": 3.52991452991453, |
| "grad_norm": 22.78207778930664, |
| "learning_rate": 8.235042735042735e-06, |
| "loss": 3.3827, |
| "step": 413 |
| }, |
| { |
| "epoch": 3.5384615384615383, |
| "grad_norm": 22.41836166381836, |
| "learning_rate": 8.230769230769232e-06, |
| "loss": 3.4521, |
| "step": 414 |
| }, |
| { |
| "epoch": 3.547008547008547, |
| "grad_norm": 60.29216384887695, |
| "learning_rate": 8.226495726495727e-06, |
| "loss": 3.4598, |
| "step": 415 |
| }, |
| { |
| "epoch": 3.5555555555555554, |
| "grad_norm": 25.27474021911621, |
| "learning_rate": 8.222222222222222e-06, |
| "loss": 3.7443, |
| "step": 416 |
| }, |
| { |
| "epoch": 3.564102564102564, |
| "grad_norm": 25.297466278076172, |
| "learning_rate": 8.217948717948719e-06, |
| "loss": 3.3123, |
| "step": 417 |
| }, |
| { |
| "epoch": 3.5726495726495724, |
| "grad_norm": 28.5858154296875, |
| "learning_rate": 8.213675213675214e-06, |
| "loss": 3.1801, |
| "step": 418 |
| }, |
| { |
| "epoch": 3.5811965811965814, |
| "grad_norm": 20.05567741394043, |
| "learning_rate": 8.20940170940171e-06, |
| "loss": 3.7242, |
| "step": 419 |
| }, |
| { |
| "epoch": 3.58974358974359, |
| "grad_norm": 32.33693313598633, |
| "learning_rate": 8.205128205128205e-06, |
| "loss": 3.3587, |
| "step": 420 |
| }, |
| { |
| "epoch": 3.5982905982905984, |
| "grad_norm": 36.1716194152832, |
| "learning_rate": 8.200854700854702e-06, |
| "loss": 3.1573, |
| "step": 421 |
| }, |
| { |
| "epoch": 3.606837606837607, |
| "grad_norm": 33.39027404785156, |
| "learning_rate": 8.196581196581197e-06, |
| "loss": 3.098, |
| "step": 422 |
| }, |
| { |
| "epoch": 3.6153846153846154, |
| "grad_norm": 28.4794864654541, |
| "learning_rate": 8.192307692307692e-06, |
| "loss": 3.6403, |
| "step": 423 |
| }, |
| { |
| "epoch": 3.623931623931624, |
| "grad_norm": 29.702611923217773, |
| "learning_rate": 8.188034188034189e-06, |
| "loss": 3.2569, |
| "step": 424 |
| }, |
| { |
| "epoch": 3.6324786324786325, |
| "grad_norm": 24.73663902282715, |
| "learning_rate": 8.183760683760684e-06, |
| "loss": 3.0508, |
| "step": 425 |
| }, |
| { |
| "epoch": 3.641025641025641, |
| "grad_norm": 29.606807708740234, |
| "learning_rate": 8.17948717948718e-06, |
| "loss": 3.2524, |
| "step": 426 |
| }, |
| { |
| "epoch": 3.6495726495726495, |
| "grad_norm": 22.721933364868164, |
| "learning_rate": 8.175213675213676e-06, |
| "loss": 3.2583, |
| "step": 427 |
| }, |
| { |
| "epoch": 3.658119658119658, |
| "grad_norm": 25.009403228759766, |
| "learning_rate": 8.17094017094017e-06, |
| "loss": 3.0678, |
| "step": 428 |
| }, |
| { |
| "epoch": 3.6666666666666665, |
| "grad_norm": 25.776636123657227, |
| "learning_rate": 8.166666666666668e-06, |
| "loss": 3.1676, |
| "step": 429 |
| }, |
| { |
| "epoch": 3.6752136752136755, |
| "grad_norm": 28.210241317749023, |
| "learning_rate": 8.162393162393163e-06, |
| "loss": 3.2869, |
| "step": 430 |
| }, |
| { |
| "epoch": 3.683760683760684, |
| "grad_norm": 26.29328155517578, |
| "learning_rate": 8.15811965811966e-06, |
| "loss": 3.3618, |
| "step": 431 |
| }, |
| { |
| "epoch": 3.6923076923076925, |
| "grad_norm": 19.813465118408203, |
| "learning_rate": 8.153846153846154e-06, |
| "loss": 3.0655, |
| "step": 432 |
| }, |
| { |
| "epoch": 3.700854700854701, |
| "grad_norm": 29.718812942504883, |
| "learning_rate": 8.14957264957265e-06, |
| "loss": 3.1538, |
| "step": 433 |
| }, |
| { |
| "epoch": 3.7094017094017095, |
| "grad_norm": 30.629135131835938, |
| "learning_rate": 8.145299145299146e-06, |
| "loss": 3.3252, |
| "step": 434 |
| }, |
| { |
| "epoch": 3.717948717948718, |
| "grad_norm": 27.716825485229492, |
| "learning_rate": 8.141025641025641e-06, |
| "loss": 3.4083, |
| "step": 435 |
| }, |
| { |
| "epoch": 3.7264957264957266, |
| "grad_norm": 39.23820877075195, |
| "learning_rate": 8.136752136752138e-06, |
| "loss": 3.3074, |
| "step": 436 |
| }, |
| { |
| "epoch": 3.735042735042735, |
| "grad_norm": 34.516422271728516, |
| "learning_rate": 8.132478632478633e-06, |
| "loss": 3.3529, |
| "step": 437 |
| }, |
| { |
| "epoch": 3.7435897435897436, |
| "grad_norm": 41.98606872558594, |
| "learning_rate": 8.12820512820513e-06, |
| "loss": 3.248, |
| "step": 438 |
| }, |
| { |
| "epoch": 3.752136752136752, |
| "grad_norm": 27.99711799621582, |
| "learning_rate": 8.123931623931625e-06, |
| "loss": 3.3054, |
| "step": 439 |
| }, |
| { |
| "epoch": 3.7606837606837606, |
| "grad_norm": 25.21969985961914, |
| "learning_rate": 8.11965811965812e-06, |
| "loss": 2.8518, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.769230769230769, |
| "grad_norm": 29.14298439025879, |
| "learning_rate": 8.115384615384617e-06, |
| "loss": 3.0063, |
| "step": 441 |
| }, |
| { |
| "epoch": 3.7777777777777777, |
| "grad_norm": 27.040063858032227, |
| "learning_rate": 8.111111111111112e-06, |
| "loss": 3.3066, |
| "step": 442 |
| }, |
| { |
| "epoch": 3.786324786324786, |
| "grad_norm": 365.3290100097656, |
| "learning_rate": 8.106837606837608e-06, |
| "loss": 3.8057, |
| "step": 443 |
| }, |
| { |
| "epoch": 3.7948717948717947, |
| "grad_norm": 32.89745330810547, |
| "learning_rate": 8.102564102564103e-06, |
| "loss": 3.0903, |
| "step": 444 |
| }, |
| { |
| "epoch": 3.8034188034188032, |
| "grad_norm": 29.448022842407227, |
| "learning_rate": 8.098290598290598e-06, |
| "loss": 3.2723, |
| "step": 445 |
| }, |
| { |
| "epoch": 3.8119658119658117, |
| "grad_norm": 27.838123321533203, |
| "learning_rate": 8.094017094017095e-06, |
| "loss": 3.2903, |
| "step": 446 |
| }, |
| { |
| "epoch": 3.8205128205128203, |
| "grad_norm": 29.047847747802734, |
| "learning_rate": 8.08974358974359e-06, |
| "loss": 2.9048, |
| "step": 447 |
| }, |
| { |
| "epoch": 3.8290598290598292, |
| "grad_norm": 28.666589736938477, |
| "learning_rate": 8.085470085470087e-06, |
| "loss": 3.2186, |
| "step": 448 |
| }, |
| { |
| "epoch": 3.8376068376068377, |
| "grad_norm": 31.796804428100586, |
| "learning_rate": 8.081196581196582e-06, |
| "loss": 3.2668, |
| "step": 449 |
| }, |
| { |
| "epoch": 3.8461538461538463, |
| "grad_norm": 22.665220260620117, |
| "learning_rate": 8.076923076923077e-06, |
| "loss": 3.0965, |
| "step": 450 |
| }, |
| { |
| "epoch": 3.8547008547008548, |
| "grad_norm": 32.7353630065918, |
| "learning_rate": 8.072649572649574e-06, |
| "loss": 3.1759, |
| "step": 451 |
| }, |
| { |
| "epoch": 3.8632478632478633, |
| "grad_norm": 32.95683670043945, |
| "learning_rate": 8.068376068376069e-06, |
| "loss": 2.9589, |
| "step": 452 |
| }, |
| { |
| "epoch": 3.871794871794872, |
| "grad_norm": 30.04659652709961, |
| "learning_rate": 8.064102564102566e-06, |
| "loss": 3.4709, |
| "step": 453 |
| }, |
| { |
| "epoch": 3.8803418803418803, |
| "grad_norm": 30.41158676147461, |
| "learning_rate": 8.05982905982906e-06, |
| "loss": 2.9385, |
| "step": 454 |
| }, |
| { |
| "epoch": 3.888888888888889, |
| "grad_norm": 30.059635162353516, |
| "learning_rate": 8.055555555555557e-06, |
| "loss": 3.0099, |
| "step": 455 |
| }, |
| { |
| "epoch": 3.8974358974358974, |
| "grad_norm": 24.83198356628418, |
| "learning_rate": 8.051282051282052e-06, |
| "loss": 2.9783, |
| "step": 456 |
| }, |
| { |
| "epoch": 3.905982905982906, |
| "grad_norm": 25.38758087158203, |
| "learning_rate": 8.047008547008547e-06, |
| "loss": 3.0275, |
| "step": 457 |
| }, |
| { |
| "epoch": 3.9145299145299144, |
| "grad_norm": 25.21868133544922, |
| "learning_rate": 8.042735042735044e-06, |
| "loss": 2.9096, |
| "step": 458 |
| }, |
| { |
| "epoch": 3.9230769230769234, |
| "grad_norm": 32.02922058105469, |
| "learning_rate": 8.03846153846154e-06, |
| "loss": 3.059, |
| "step": 459 |
| }, |
| { |
| "epoch": 3.931623931623932, |
| "grad_norm": 22.240680694580078, |
| "learning_rate": 8.034188034188036e-06, |
| "loss": 2.9473, |
| "step": 460 |
| }, |
| { |
| "epoch": 3.9401709401709404, |
| "grad_norm": 27.61838150024414, |
| "learning_rate": 8.029914529914531e-06, |
| "loss": 2.4506, |
| "step": 461 |
| }, |
| { |
| "epoch": 3.948717948717949, |
| "grad_norm": 27.742216110229492, |
| "learning_rate": 8.025641025641026e-06, |
| "loss": 2.9082, |
| "step": 462 |
| }, |
| { |
| "epoch": 3.9572649572649574, |
| "grad_norm": 29.965059280395508, |
| "learning_rate": 8.021367521367523e-06, |
| "loss": 2.8268, |
| "step": 463 |
| }, |
| { |
| "epoch": 3.965811965811966, |
| "grad_norm": 31.429990768432617, |
| "learning_rate": 8.017094017094018e-06, |
| "loss": 3.1805, |
| "step": 464 |
| }, |
| { |
| "epoch": 3.9743589743589745, |
| "grad_norm": 31.162532806396484, |
| "learning_rate": 8.012820512820515e-06, |
| "loss": 2.64, |
| "step": 465 |
| }, |
| { |
| "epoch": 3.982905982905983, |
| "grad_norm": 28.240577697753906, |
| "learning_rate": 8.00854700854701e-06, |
| "loss": 3.249, |
| "step": 466 |
| }, |
| { |
| "epoch": 3.9914529914529915, |
| "grad_norm": 48.52914810180664, |
| "learning_rate": 8.004273504273505e-06, |
| "loss": 3.1619, |
| "step": 467 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 36.80685806274414, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 3.5337, |
| "step": 468 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 2.1340389251708984, |
| "eval_runtime": 9.2211, |
| "eval_samples_per_second": 50.536, |
| "eval_steps_per_second": 6.398, |
| "step": 468 |
| }, |
| { |
| "epoch": 4.0085470085470085, |
| "grad_norm": 45.45211410522461, |
| "learning_rate": 7.995726495726496e-06, |
| "loss": 3.5596, |
| "step": 469 |
| }, |
| { |
| "epoch": 4.017094017094017, |
| "grad_norm": 32.711669921875, |
| "learning_rate": 7.991452991452993e-06, |
| "loss": 2.9362, |
| "step": 470 |
| }, |
| { |
| "epoch": 4.0256410256410255, |
| "grad_norm": 26.151872634887695, |
| "learning_rate": 7.987179487179488e-06, |
| "loss": 2.6796, |
| "step": 471 |
| }, |
| { |
| "epoch": 4.034188034188034, |
| "grad_norm": 33.02329635620117, |
| "learning_rate": 7.982905982905985e-06, |
| "loss": 2.7147, |
| "step": 472 |
| }, |
| { |
| "epoch": 4.042735042735043, |
| "grad_norm": 31.1684513092041, |
| "learning_rate": 7.97863247863248e-06, |
| "loss": 3.2356, |
| "step": 473 |
| }, |
| { |
| "epoch": 4.051282051282051, |
| "grad_norm": 37.0435905456543, |
| "learning_rate": 7.974358974358975e-06, |
| "loss": 2.9954, |
| "step": 474 |
| }, |
| { |
| "epoch": 4.05982905982906, |
| "grad_norm": 25.989973068237305, |
| "learning_rate": 7.970085470085472e-06, |
| "loss": 3.2143, |
| "step": 475 |
| }, |
| { |
| "epoch": 4.068376068376068, |
| "grad_norm": 27.048690795898438, |
| "learning_rate": 7.965811965811967e-06, |
| "loss": 2.5087, |
| "step": 476 |
| }, |
| { |
| "epoch": 4.076923076923077, |
| "grad_norm": 26.857696533203125, |
| "learning_rate": 7.961538461538462e-06, |
| "loss": 2.6466, |
| "step": 477 |
| }, |
| { |
| "epoch": 4.085470085470085, |
| "grad_norm": 33.342193603515625, |
| "learning_rate": 7.957264957264957e-06, |
| "loss": 2.6591, |
| "step": 478 |
| }, |
| { |
| "epoch": 4.094017094017094, |
| "grad_norm": 64.21253967285156, |
| "learning_rate": 7.952991452991454e-06, |
| "loss": 3.0295, |
| "step": 479 |
| }, |
| { |
| "epoch": 4.102564102564102, |
| "grad_norm": 31.240161895751953, |
| "learning_rate": 7.948717948717949e-06, |
| "loss": 2.9374, |
| "step": 480 |
| }, |
| { |
| "epoch": 4.111111111111111, |
| "grad_norm": 29.338851928710938, |
| "learning_rate": 7.944444444444445e-06, |
| "loss": 2.5019, |
| "step": 481 |
| }, |
| { |
| "epoch": 4.119658119658119, |
| "grad_norm": 36.79518127441406, |
| "learning_rate": 7.94017094017094e-06, |
| "loss": 2.7649, |
| "step": 482 |
| }, |
| { |
| "epoch": 4.128205128205128, |
| "grad_norm": 37.036739349365234, |
| "learning_rate": 7.935897435897435e-06, |
| "loss": 2.5182, |
| "step": 483 |
| }, |
| { |
| "epoch": 4.136752136752137, |
| "grad_norm": 42.571163177490234, |
| "learning_rate": 7.931623931623932e-06, |
| "loss": 2.767, |
| "step": 484 |
| }, |
| { |
| "epoch": 4.145299145299146, |
| "grad_norm": 33.72893524169922, |
| "learning_rate": 7.927350427350427e-06, |
| "loss": 3.1404, |
| "step": 485 |
| }, |
| { |
| "epoch": 4.153846153846154, |
| "grad_norm": 27.06032943725586, |
| "learning_rate": 7.923076923076924e-06, |
| "loss": 2.6825, |
| "step": 486 |
| }, |
| { |
| "epoch": 4.162393162393163, |
| "grad_norm": 31.8147029876709, |
| "learning_rate": 7.918803418803419e-06, |
| "loss": 2.5129, |
| "step": 487 |
| }, |
| { |
| "epoch": 4.170940170940171, |
| "grad_norm": 35.681793212890625, |
| "learning_rate": 7.914529914529914e-06, |
| "loss": 2.4793, |
| "step": 488 |
| }, |
| { |
| "epoch": 4.17948717948718, |
| "grad_norm": 159.4467315673828, |
| "learning_rate": 7.91025641025641e-06, |
| "loss": 3.5531, |
| "step": 489 |
| }, |
| { |
| "epoch": 4.188034188034188, |
| "grad_norm": 40.12252426147461, |
| "learning_rate": 7.905982905982906e-06, |
| "loss": 2.7095, |
| "step": 490 |
| }, |
| { |
| "epoch": 4.196581196581197, |
| "grad_norm": 27.05786895751953, |
| "learning_rate": 7.901709401709403e-06, |
| "loss": 2.5984, |
| "step": 491 |
| }, |
| { |
| "epoch": 4.205128205128205, |
| "grad_norm": 24.31035614013672, |
| "learning_rate": 7.897435897435898e-06, |
| "loss": 2.89, |
| "step": 492 |
| }, |
| { |
| "epoch": 4.213675213675214, |
| "grad_norm": 277.16156005859375, |
| "learning_rate": 7.893162393162394e-06, |
| "loss": 3.8076, |
| "step": 493 |
| }, |
| { |
| "epoch": 4.222222222222222, |
| "grad_norm": 29.722867965698242, |
| "learning_rate": 7.88888888888889e-06, |
| "loss": 2.4189, |
| "step": 494 |
| }, |
| { |
| "epoch": 4.230769230769231, |
| "grad_norm": 40.47605514526367, |
| "learning_rate": 7.884615384615384e-06, |
| "loss": 2.6225, |
| "step": 495 |
| }, |
| { |
| "epoch": 4.239316239316239, |
| "grad_norm": 29.136499404907227, |
| "learning_rate": 7.880341880341881e-06, |
| "loss": 2.5223, |
| "step": 496 |
| }, |
| { |
| "epoch": 4.247863247863248, |
| "grad_norm": 78.86258697509766, |
| "learning_rate": 7.876068376068376e-06, |
| "loss": 2.6587, |
| "step": 497 |
| }, |
| { |
| "epoch": 4.256410256410256, |
| "grad_norm": 24.473243713378906, |
| "learning_rate": 7.871794871794873e-06, |
| "loss": 2.456, |
| "step": 498 |
| }, |
| { |
| "epoch": 4.264957264957265, |
| "grad_norm": 80.45248413085938, |
| "learning_rate": 7.867521367521368e-06, |
| "loss": 3.1893, |
| "step": 499 |
| }, |
| { |
| "epoch": 4.273504273504273, |
| "grad_norm": 194.2708282470703, |
| "learning_rate": 7.863247863247863e-06, |
| "loss": 3.8294, |
| "step": 500 |
| }, |
| { |
| "epoch": 4.282051282051282, |
| "grad_norm": 27.74302101135254, |
| "learning_rate": 7.85897435897436e-06, |
| "loss": 2.2506, |
| "step": 501 |
| }, |
| { |
| "epoch": 4.2905982905982905, |
| "grad_norm": 21.90385627746582, |
| "learning_rate": 7.854700854700855e-06, |
| "loss": 3.0985, |
| "step": 502 |
| }, |
| { |
| "epoch": 4.299145299145299, |
| "grad_norm": 50.30342102050781, |
| "learning_rate": 7.850427350427352e-06, |
| "loss": 2.526, |
| "step": 503 |
| }, |
| { |
| "epoch": 4.3076923076923075, |
| "grad_norm": 28.666881561279297, |
| "learning_rate": 7.846153846153847e-06, |
| "loss": 2.4213, |
| "step": 504 |
| }, |
| { |
| "epoch": 4.316239316239316, |
| "grad_norm": 27.927257537841797, |
| "learning_rate": 7.841880341880342e-06, |
| "loss": 2.6731, |
| "step": 505 |
| }, |
| { |
| "epoch": 4.3247863247863245, |
| "grad_norm": 36.12032699584961, |
| "learning_rate": 7.837606837606838e-06, |
| "loss": 2.3323, |
| "step": 506 |
| }, |
| { |
| "epoch": 4.333333333333333, |
| "grad_norm": 31.632287979125977, |
| "learning_rate": 7.833333333333333e-06, |
| "loss": 2.2966, |
| "step": 507 |
| }, |
| { |
| "epoch": 4.3418803418803416, |
| "grad_norm": 26.511537551879883, |
| "learning_rate": 7.82905982905983e-06, |
| "loss": 2.3422, |
| "step": 508 |
| }, |
| { |
| "epoch": 4.35042735042735, |
| "grad_norm": 31.429107666015625, |
| "learning_rate": 7.824786324786325e-06, |
| "loss": 2.6764, |
| "step": 509 |
| }, |
| { |
| "epoch": 4.358974358974359, |
| "grad_norm": 29.8817138671875, |
| "learning_rate": 7.820512820512822e-06, |
| "loss": 2.4358, |
| "step": 510 |
| }, |
| { |
| "epoch": 4.367521367521368, |
| "grad_norm": 29.293964385986328, |
| "learning_rate": 7.816239316239317e-06, |
| "loss": 2.504, |
| "step": 511 |
| }, |
| { |
| "epoch": 4.3760683760683765, |
| "grad_norm": 23.624290466308594, |
| "learning_rate": 7.811965811965812e-06, |
| "loss": 2.0312, |
| "step": 512 |
| }, |
| { |
| "epoch": 4.384615384615385, |
| "grad_norm": 25.336505889892578, |
| "learning_rate": 7.807692307692309e-06, |
| "loss": 2.1045, |
| "step": 513 |
| }, |
| { |
| "epoch": 4.3931623931623935, |
| "grad_norm": 24.755443572998047, |
| "learning_rate": 7.803418803418804e-06, |
| "loss": 2.5754, |
| "step": 514 |
| }, |
| { |
| "epoch": 4.401709401709402, |
| "grad_norm": 29.29696273803711, |
| "learning_rate": 7.7991452991453e-06, |
| "loss": 2.562, |
| "step": 515 |
| }, |
| { |
| "epoch": 4.410256410256411, |
| "grad_norm": 28.054868698120117, |
| "learning_rate": 7.794871794871796e-06, |
| "loss": 1.9815, |
| "step": 516 |
| }, |
| { |
| "epoch": 4.418803418803419, |
| "grad_norm": 20.894853591918945, |
| "learning_rate": 7.79059829059829e-06, |
| "loss": 2.5668, |
| "step": 517 |
| }, |
| { |
| "epoch": 4.427350427350428, |
| "grad_norm": 19.532094955444336, |
| "learning_rate": 7.786324786324787e-06, |
| "loss": 2.2314, |
| "step": 518 |
| }, |
| { |
| "epoch": 4.435897435897436, |
| "grad_norm": 27.919715881347656, |
| "learning_rate": 7.782051282051282e-06, |
| "loss": 1.9523, |
| "step": 519 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "grad_norm": 21.91543960571289, |
| "learning_rate": 7.77777777777778e-06, |
| "loss": 2.559, |
| "step": 520 |
| }, |
| { |
| "epoch": 4.452991452991453, |
| "grad_norm": 26.20106315612793, |
| "learning_rate": 7.773504273504274e-06, |
| "loss": 2.367, |
| "step": 521 |
| }, |
| { |
| "epoch": 4.461538461538462, |
| "grad_norm": 23.455419540405273, |
| "learning_rate": 7.76923076923077e-06, |
| "loss": 2.4132, |
| "step": 522 |
| }, |
| { |
| "epoch": 4.47008547008547, |
| "grad_norm": 49.62391662597656, |
| "learning_rate": 7.764957264957266e-06, |
| "loss": 1.8896, |
| "step": 523 |
| }, |
| { |
| "epoch": 4.478632478632479, |
| "grad_norm": 25.721101760864258, |
| "learning_rate": 7.760683760683761e-06, |
| "loss": 1.9918, |
| "step": 524 |
| }, |
| { |
| "epoch": 4.487179487179487, |
| "grad_norm": 22.906694412231445, |
| "learning_rate": 7.756410256410258e-06, |
| "loss": 2.1819, |
| "step": 525 |
| }, |
| { |
| "epoch": 4.495726495726496, |
| "grad_norm": 28.5809268951416, |
| "learning_rate": 7.752136752136753e-06, |
| "loss": 2.0516, |
| "step": 526 |
| }, |
| { |
| "epoch": 4.504273504273504, |
| "grad_norm": 26.47665023803711, |
| "learning_rate": 7.74786324786325e-06, |
| "loss": 2.0081, |
| "step": 527 |
| }, |
| { |
| "epoch": 4.512820512820513, |
| "grad_norm": 27.221372604370117, |
| "learning_rate": 7.743589743589745e-06, |
| "loss": 2.0414, |
| "step": 528 |
| }, |
| { |
| "epoch": 4.521367521367521, |
| "grad_norm": 27.931568145751953, |
| "learning_rate": 7.73931623931624e-06, |
| "loss": 2.0335, |
| "step": 529 |
| }, |
| { |
| "epoch": 4.52991452991453, |
| "grad_norm": 25.567049026489258, |
| "learning_rate": 7.735042735042736e-06, |
| "loss": 2.0129, |
| "step": 530 |
| }, |
| { |
| "epoch": 4.538461538461538, |
| "grad_norm": 30.897083282470703, |
| "learning_rate": 7.730769230769231e-06, |
| "loss": 2.3941, |
| "step": 531 |
| }, |
| { |
| "epoch": 4.547008547008547, |
| "grad_norm": 21.92133903503418, |
| "learning_rate": 7.726495726495728e-06, |
| "loss": 2.2563, |
| "step": 532 |
| }, |
| { |
| "epoch": 4.555555555555555, |
| "grad_norm": 27.053892135620117, |
| "learning_rate": 7.722222222222223e-06, |
| "loss": 2.2463, |
| "step": 533 |
| }, |
| { |
| "epoch": 4.564102564102564, |
| "grad_norm": 29.3230037689209, |
| "learning_rate": 7.717948717948718e-06, |
| "loss": 1.9167, |
| "step": 534 |
| }, |
| { |
| "epoch": 4.572649572649572, |
| "grad_norm": 36.06028747558594, |
| "learning_rate": 7.713675213675215e-06, |
| "loss": 1.9106, |
| "step": 535 |
| }, |
| { |
| "epoch": 4.581196581196581, |
| "grad_norm": 24.622135162353516, |
| "learning_rate": 7.70940170940171e-06, |
| "loss": 2.2899, |
| "step": 536 |
| }, |
| { |
| "epoch": 4.589743589743589, |
| "grad_norm": 21.3137264251709, |
| "learning_rate": 7.705128205128207e-06, |
| "loss": 2.0166, |
| "step": 537 |
| }, |
| { |
| "epoch": 4.598290598290598, |
| "grad_norm": 21.939279556274414, |
| "learning_rate": 7.700854700854702e-06, |
| "loss": 2.3319, |
| "step": 538 |
| }, |
| { |
| "epoch": 4.6068376068376065, |
| "grad_norm": 25.496994018554688, |
| "learning_rate": 7.696581196581197e-06, |
| "loss": 2.6162, |
| "step": 539 |
| }, |
| { |
| "epoch": 4.615384615384615, |
| "grad_norm": 24.095666885375977, |
| "learning_rate": 7.692307692307694e-06, |
| "loss": 2.2863, |
| "step": 540 |
| }, |
| { |
| "epoch": 4.6239316239316235, |
| "grad_norm": 31.96511459350586, |
| "learning_rate": 7.688034188034189e-06, |
| "loss": 2.0261, |
| "step": 541 |
| }, |
| { |
| "epoch": 4.632478632478632, |
| "grad_norm": 22.66115379333496, |
| "learning_rate": 7.683760683760685e-06, |
| "loss": 2.2786, |
| "step": 542 |
| }, |
| { |
| "epoch": 4.641025641025641, |
| "grad_norm": 23.661611557006836, |
| "learning_rate": 7.67948717948718e-06, |
| "loss": 1.7113, |
| "step": 543 |
| }, |
| { |
| "epoch": 4.64957264957265, |
| "grad_norm": 18.64708709716797, |
| "learning_rate": 7.675213675213677e-06, |
| "loss": 2.1389, |
| "step": 544 |
| }, |
| { |
| "epoch": 4.6581196581196584, |
| "grad_norm": 20.55480194091797, |
| "learning_rate": 7.670940170940172e-06, |
| "loss": 2.0831, |
| "step": 545 |
| }, |
| { |
| "epoch": 4.666666666666667, |
| "grad_norm": 27.876964569091797, |
| "learning_rate": 7.666666666666667e-06, |
| "loss": 2.0358, |
| "step": 546 |
| }, |
| { |
| "epoch": 4.6752136752136755, |
| "grad_norm": 20.236507415771484, |
| "learning_rate": 7.662393162393164e-06, |
| "loss": 1.5596, |
| "step": 547 |
| }, |
| { |
| "epoch": 4.683760683760684, |
| "grad_norm": 23.360782623291016, |
| "learning_rate": 7.658119658119659e-06, |
| "loss": 1.9623, |
| "step": 548 |
| }, |
| { |
| "epoch": 4.6923076923076925, |
| "grad_norm": 41.7568359375, |
| "learning_rate": 7.653846153846154e-06, |
| "loss": 1.9884, |
| "step": 549 |
| }, |
| { |
| "epoch": 4.700854700854701, |
| "grad_norm": 28.651065826416016, |
| "learning_rate": 7.649572649572649e-06, |
| "loss": 2.1491, |
| "step": 550 |
| }, |
| { |
| "epoch": 4.7094017094017095, |
| "grad_norm": 23.636432647705078, |
| "learning_rate": 7.645299145299146e-06, |
| "loss": 1.9352, |
| "step": 551 |
| }, |
| { |
| "epoch": 4.717948717948718, |
| "grad_norm": 25.313966751098633, |
| "learning_rate": 7.641025641025641e-06, |
| "loss": 2.4112, |
| "step": 552 |
| }, |
| { |
| "epoch": 4.726495726495727, |
| "grad_norm": 32.4974479675293, |
| "learning_rate": 7.636752136752138e-06, |
| "loss": 1.7017, |
| "step": 553 |
| }, |
| { |
| "epoch": 4.735042735042735, |
| "grad_norm": 20.644481658935547, |
| "learning_rate": 7.632478632478633e-06, |
| "loss": 1.6904, |
| "step": 554 |
| }, |
| { |
| "epoch": 4.743589743589744, |
| "grad_norm": 26.526721954345703, |
| "learning_rate": 7.6282051282051286e-06, |
| "loss": 2.1666, |
| "step": 555 |
| }, |
| { |
| "epoch": 4.752136752136752, |
| "grad_norm": 23.375839233398438, |
| "learning_rate": 7.6239316239316244e-06, |
| "loss": 1.5555, |
| "step": 556 |
| }, |
| { |
| "epoch": 4.760683760683761, |
| "grad_norm": 29.890501022338867, |
| "learning_rate": 7.6196581196581195e-06, |
| "loss": 2.0195, |
| "step": 557 |
| }, |
| { |
| "epoch": 4.769230769230769, |
| "grad_norm": 687.5745239257812, |
| "learning_rate": 7.615384615384615e-06, |
| "loss": 2.4286, |
| "step": 558 |
| }, |
| { |
| "epoch": 4.777777777777778, |
| "grad_norm": 22.844587326049805, |
| "learning_rate": 7.611111111111111e-06, |
| "loss": 2.2335, |
| "step": 559 |
| }, |
| { |
| "epoch": 4.786324786324786, |
| "grad_norm": 29.633562088012695, |
| "learning_rate": 7.606837606837607e-06, |
| "loss": 1.7579, |
| "step": 560 |
| }, |
| { |
| "epoch": 4.794871794871795, |
| "grad_norm": 48.04582977294922, |
| "learning_rate": 7.602564102564103e-06, |
| "loss": 2.3846, |
| "step": 561 |
| }, |
| { |
| "epoch": 4.803418803418803, |
| "grad_norm": 27.2290096282959, |
| "learning_rate": 7.598290598290599e-06, |
| "loss": 2.2234, |
| "step": 562 |
| }, |
| { |
| "epoch": 4.811965811965812, |
| "grad_norm": 29.782209396362305, |
| "learning_rate": 7.594017094017094e-06, |
| "loss": 2.0365, |
| "step": 563 |
| }, |
| { |
| "epoch": 4.82051282051282, |
| "grad_norm": 32.457061767578125, |
| "learning_rate": 7.58974358974359e-06, |
| "loss": 2.0451, |
| "step": 564 |
| }, |
| { |
| "epoch": 4.829059829059829, |
| "grad_norm": 22.089427947998047, |
| "learning_rate": 7.585470085470086e-06, |
| "loss": 1.7105, |
| "step": 565 |
| }, |
| { |
| "epoch": 4.837606837606837, |
| "grad_norm": 23.105140686035156, |
| "learning_rate": 7.581196581196582e-06, |
| "loss": 1.6817, |
| "step": 566 |
| }, |
| { |
| "epoch": 4.846153846153846, |
| "grad_norm": 24.513713836669922, |
| "learning_rate": 7.5769230769230775e-06, |
| "loss": 1.9553, |
| "step": 567 |
| }, |
| { |
| "epoch": 4.854700854700854, |
| "grad_norm": 22.187759399414062, |
| "learning_rate": 7.572649572649573e-06, |
| "loss": 2.0309, |
| "step": 568 |
| }, |
| { |
| "epoch": 4.863247863247864, |
| "grad_norm": 53.56728744506836, |
| "learning_rate": 7.5683760683760685e-06, |
| "loss": 2.6508, |
| "step": 569 |
| }, |
| { |
| "epoch": 4.871794871794872, |
| "grad_norm": 27.983978271484375, |
| "learning_rate": 7.564102564102564e-06, |
| "loss": 2.1942, |
| "step": 570 |
| }, |
| { |
| "epoch": 4.880341880341881, |
| "grad_norm": 25.610252380371094, |
| "learning_rate": 7.55982905982906e-06, |
| "loss": 1.4151, |
| "step": 571 |
| }, |
| { |
| "epoch": 4.888888888888889, |
| "grad_norm": 19.856618881225586, |
| "learning_rate": 7.555555555555556e-06, |
| "loss": 1.6968, |
| "step": 572 |
| }, |
| { |
| "epoch": 4.897435897435898, |
| "grad_norm": 20.288606643676758, |
| "learning_rate": 7.551282051282052e-06, |
| "loss": 1.7494, |
| "step": 573 |
| }, |
| { |
| "epoch": 4.905982905982906, |
| "grad_norm": 23.206768035888672, |
| "learning_rate": 7.547008547008547e-06, |
| "loss": 2.1255, |
| "step": 574 |
| }, |
| { |
| "epoch": 4.914529914529915, |
| "grad_norm": 21.275257110595703, |
| "learning_rate": 7.542735042735043e-06, |
| "loss": 1.7442, |
| "step": 575 |
| }, |
| { |
| "epoch": 4.923076923076923, |
| "grad_norm": 22.635417938232422, |
| "learning_rate": 7.538461538461539e-06, |
| "loss": 1.9129, |
| "step": 576 |
| }, |
| { |
| "epoch": 4.931623931623932, |
| "grad_norm": 21.440109252929688, |
| "learning_rate": 7.534188034188035e-06, |
| "loss": 2.0056, |
| "step": 577 |
| }, |
| { |
| "epoch": 4.94017094017094, |
| "grad_norm": 20.939407348632812, |
| "learning_rate": 7.529914529914531e-06, |
| "loss": 1.7231, |
| "step": 578 |
| }, |
| { |
| "epoch": 4.948717948717949, |
| "grad_norm": 16.189861297607422, |
| "learning_rate": 7.5256410256410265e-06, |
| "loss": 1.4255, |
| "step": 579 |
| }, |
| { |
| "epoch": 4.957264957264957, |
| "grad_norm": 23.6302547454834, |
| "learning_rate": 7.521367521367522e-06, |
| "loss": 1.6748, |
| "step": 580 |
| }, |
| { |
| "epoch": 4.965811965811966, |
| "grad_norm": 22.29713249206543, |
| "learning_rate": 7.5170940170940175e-06, |
| "loss": 1.5285, |
| "step": 581 |
| }, |
| { |
| "epoch": 4.9743589743589745, |
| "grad_norm": 22.831275939941406, |
| "learning_rate": 7.512820512820513e-06, |
| "loss": 1.7742, |
| "step": 582 |
| }, |
| { |
| "epoch": 4.982905982905983, |
| "grad_norm": 630.5899658203125, |
| "learning_rate": 7.508547008547009e-06, |
| "loss": 2.8598, |
| "step": 583 |
| }, |
| { |
| "epoch": 4.9914529914529915, |
| "grad_norm": 22.880647659301758, |
| "learning_rate": 7.504273504273505e-06, |
| "loss": 1.6231, |
| "step": 584 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 21.379072189331055, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 1.3506, |
| "step": 585 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.8325614333152771, |
| "eval_runtime": 9.2303, |
| "eval_samples_per_second": 50.486, |
| "eval_steps_per_second": 6.392, |
| "step": 585 |
| }, |
| { |
| "epoch": 5.0085470085470085, |
| "grad_norm": 23.968698501586914, |
| "learning_rate": 7.495726495726496e-06, |
| "loss": 1.4263, |
| "step": 586 |
| }, |
| { |
| "epoch": 5.017094017094017, |
| "grad_norm": 24.880769729614258, |
| "learning_rate": 7.491452991452992e-06, |
| "loss": 1.4994, |
| "step": 587 |
| }, |
| { |
| "epoch": 5.0256410256410255, |
| "grad_norm": 23.4547176361084, |
| "learning_rate": 7.487179487179488e-06, |
| "loss": 1.671, |
| "step": 588 |
| }, |
| { |
| "epoch": 5.034188034188034, |
| "grad_norm": 17.382152557373047, |
| "learning_rate": 7.482905982905984e-06, |
| "loss": 1.3935, |
| "step": 589 |
| }, |
| { |
| "epoch": 5.042735042735043, |
| "grad_norm": 19.607717514038086, |
| "learning_rate": 7.47863247863248e-06, |
| "loss": 1.5652, |
| "step": 590 |
| }, |
| { |
| "epoch": 5.051282051282051, |
| "grad_norm": 27.735240936279297, |
| "learning_rate": 7.474358974358975e-06, |
| "loss": 1.5491, |
| "step": 591 |
| }, |
| { |
| "epoch": 5.05982905982906, |
| "grad_norm": 20.493412017822266, |
| "learning_rate": 7.4700854700854706e-06, |
| "loss": 1.9229, |
| "step": 592 |
| }, |
| { |
| "epoch": 5.068376068376068, |
| "grad_norm": 20.492137908935547, |
| "learning_rate": 7.4658119658119665e-06, |
| "loss": 1.5066, |
| "step": 593 |
| }, |
| { |
| "epoch": 5.076923076923077, |
| "grad_norm": 27.650495529174805, |
| "learning_rate": 7.461538461538462e-06, |
| "loss": 1.4228, |
| "step": 594 |
| }, |
| { |
| "epoch": 5.085470085470085, |
| "grad_norm": 22.38190269470215, |
| "learning_rate": 7.457264957264958e-06, |
| "loss": 1.6243, |
| "step": 595 |
| }, |
| { |
| "epoch": 5.094017094017094, |
| "grad_norm": 22.862489700317383, |
| "learning_rate": 7.452991452991454e-06, |
| "loss": 1.9224, |
| "step": 596 |
| }, |
| { |
| "epoch": 5.102564102564102, |
| "grad_norm": 17.368051528930664, |
| "learning_rate": 7.448717948717949e-06, |
| "loss": 1.3642, |
| "step": 597 |
| }, |
| { |
| "epoch": 5.111111111111111, |
| "grad_norm": 20.587018966674805, |
| "learning_rate": 7.444444444444445e-06, |
| "loss": 1.471, |
| "step": 598 |
| }, |
| { |
| "epoch": 5.119658119658119, |
| "grad_norm": 18.502887725830078, |
| "learning_rate": 7.440170940170941e-06, |
| "loss": 1.9841, |
| "step": 599 |
| }, |
| { |
| "epoch": 5.128205128205128, |
| "grad_norm": 21.305294036865234, |
| "learning_rate": 7.435897435897437e-06, |
| "loss": 1.8564, |
| "step": 600 |
| }, |
| { |
| "epoch": 5.136752136752137, |
| "grad_norm": 20.61264419555664, |
| "learning_rate": 7.431623931623933e-06, |
| "loss": 1.3554, |
| "step": 601 |
| }, |
| { |
| "epoch": 5.145299145299146, |
| "grad_norm": 19.05555534362793, |
| "learning_rate": 7.427350427350429e-06, |
| "loss": 1.6612, |
| "step": 602 |
| }, |
| { |
| "epoch": 5.153846153846154, |
| "grad_norm": 20.392446517944336, |
| "learning_rate": 7.423076923076924e-06, |
| "loss": 1.5071, |
| "step": 603 |
| }, |
| { |
| "epoch": 5.162393162393163, |
| "grad_norm": 22.007591247558594, |
| "learning_rate": 7.4188034188034196e-06, |
| "loss": 1.3356, |
| "step": 604 |
| }, |
| { |
| "epoch": 5.170940170940171, |
| "grad_norm": 18.928104400634766, |
| "learning_rate": 7.4145299145299155e-06, |
| "loss": 1.6214, |
| "step": 605 |
| }, |
| { |
| "epoch": 5.17948717948718, |
| "grad_norm": 21.151193618774414, |
| "learning_rate": 7.410256410256411e-06, |
| "loss": 1.5275, |
| "step": 606 |
| }, |
| { |
| "epoch": 5.188034188034188, |
| "grad_norm": 16.272262573242188, |
| "learning_rate": 7.405982905982907e-06, |
| "loss": 1.2773, |
| "step": 607 |
| }, |
| { |
| "epoch": 5.196581196581197, |
| "grad_norm": 21.59275245666504, |
| "learning_rate": 7.401709401709402e-06, |
| "loss": 1.3503, |
| "step": 608 |
| }, |
| { |
| "epoch": 5.205128205128205, |
| "grad_norm": 84.31806182861328, |
| "learning_rate": 7.397435897435898e-06, |
| "loss": 1.8618, |
| "step": 609 |
| }, |
| { |
| "epoch": 5.213675213675214, |
| "grad_norm": 20.374465942382812, |
| "learning_rate": 7.393162393162394e-06, |
| "loss": 1.6153, |
| "step": 610 |
| }, |
| { |
| "epoch": 5.222222222222222, |
| "grad_norm": 18.569623947143555, |
| "learning_rate": 7.38888888888889e-06, |
| "loss": 1.7101, |
| "step": 611 |
| }, |
| { |
| "epoch": 5.230769230769231, |
| "grad_norm": 19.51409339904785, |
| "learning_rate": 7.384615384615386e-06, |
| "loss": 1.5801, |
| "step": 612 |
| }, |
| { |
| "epoch": 5.239316239316239, |
| "grad_norm": 19.45322608947754, |
| "learning_rate": 7.380341880341882e-06, |
| "loss": 1.1376, |
| "step": 613 |
| }, |
| { |
| "epoch": 5.247863247863248, |
| "grad_norm": 23.474557876586914, |
| "learning_rate": 7.376068376068377e-06, |
| "loss": 1.442, |
| "step": 614 |
| }, |
| { |
| "epoch": 5.256410256410256, |
| "grad_norm": 21.458847045898438, |
| "learning_rate": 7.371794871794873e-06, |
| "loss": 1.2769, |
| "step": 615 |
| }, |
| { |
| "epoch": 5.264957264957265, |
| "grad_norm": 25.741121292114258, |
| "learning_rate": 7.3675213675213686e-06, |
| "loss": 1.3321, |
| "step": 616 |
| }, |
| { |
| "epoch": 5.273504273504273, |
| "grad_norm": 15.394718170166016, |
| "learning_rate": 7.3632478632478645e-06, |
| "loss": 1.2335, |
| "step": 617 |
| }, |
| { |
| "epoch": 5.282051282051282, |
| "grad_norm": 20.938871383666992, |
| "learning_rate": 7.35897435897436e-06, |
| "loss": 1.5741, |
| "step": 618 |
| }, |
| { |
| "epoch": 5.2905982905982905, |
| "grad_norm": 19.348268508911133, |
| "learning_rate": 7.354700854700856e-06, |
| "loss": 1.2493, |
| "step": 619 |
| }, |
| { |
| "epoch": 5.299145299145299, |
| "grad_norm": 25.26751708984375, |
| "learning_rate": 7.350427350427351e-06, |
| "loss": 1.5167, |
| "step": 620 |
| }, |
| { |
| "epoch": 5.3076923076923075, |
| "grad_norm": 22.099227905273438, |
| "learning_rate": 7.346153846153847e-06, |
| "loss": 1.3269, |
| "step": 621 |
| }, |
| { |
| "epoch": 5.316239316239316, |
| "grad_norm": 21.483428955078125, |
| "learning_rate": 7.341880341880342e-06, |
| "loss": 1.4249, |
| "step": 622 |
| }, |
| { |
| "epoch": 5.3247863247863245, |
| "grad_norm": 20.089691162109375, |
| "learning_rate": 7.337606837606837e-06, |
| "loss": 1.351, |
| "step": 623 |
| }, |
| { |
| "epoch": 5.333333333333333, |
| "grad_norm": 138.9898223876953, |
| "learning_rate": 7.333333333333333e-06, |
| "loss": 1.5682, |
| "step": 624 |
| }, |
| { |
| "epoch": 5.3418803418803416, |
| "grad_norm": 16.808000564575195, |
| "learning_rate": 7.329059829059829e-06, |
| "loss": 1.4794, |
| "step": 625 |
| }, |
| { |
| "epoch": 5.35042735042735, |
| "grad_norm": 18.58464813232422, |
| "learning_rate": 7.324786324786325e-06, |
| "loss": 1.4486, |
| "step": 626 |
| }, |
| { |
| "epoch": 5.358974358974359, |
| "grad_norm": 15.074477195739746, |
| "learning_rate": 7.320512820512821e-06, |
| "loss": 1.3124, |
| "step": 627 |
| }, |
| { |
| "epoch": 5.367521367521368, |
| "grad_norm": 15.800148963928223, |
| "learning_rate": 7.316239316239317e-06, |
| "loss": 1.7055, |
| "step": 628 |
| }, |
| { |
| "epoch": 5.3760683760683765, |
| "grad_norm": 19.166179656982422, |
| "learning_rate": 7.311965811965812e-06, |
| "loss": 1.7306, |
| "step": 629 |
| }, |
| { |
| "epoch": 5.384615384615385, |
| "grad_norm": 55.91648864746094, |
| "learning_rate": 7.307692307692308e-06, |
| "loss": 1.2376, |
| "step": 630 |
| }, |
| { |
| "epoch": 5.3931623931623935, |
| "grad_norm": 16.606033325195312, |
| "learning_rate": 7.3034188034188035e-06, |
| "loss": 1.1159, |
| "step": 631 |
| }, |
| { |
| "epoch": 5.401709401709402, |
| "grad_norm": 17.0134220123291, |
| "learning_rate": 7.299145299145299e-06, |
| "loss": 1.2124, |
| "step": 632 |
| }, |
| { |
| "epoch": 5.410256410256411, |
| "grad_norm": 17.511932373046875, |
| "learning_rate": 7.294871794871795e-06, |
| "loss": 1.4221, |
| "step": 633 |
| }, |
| { |
| "epoch": 5.418803418803419, |
| "grad_norm": 44.53416061401367, |
| "learning_rate": 7.290598290598291e-06, |
| "loss": 1.9583, |
| "step": 634 |
| }, |
| { |
| "epoch": 5.427350427350428, |
| "grad_norm": 16.546630859375, |
| "learning_rate": 7.286324786324786e-06, |
| "loss": 1.1722, |
| "step": 635 |
| }, |
| { |
| "epoch": 5.435897435897436, |
| "grad_norm": 39.90822982788086, |
| "learning_rate": 7.282051282051282e-06, |
| "loss": 1.7482, |
| "step": 636 |
| }, |
| { |
| "epoch": 5.444444444444445, |
| "grad_norm": 16.186573028564453, |
| "learning_rate": 7.277777777777778e-06, |
| "loss": 1.3422, |
| "step": 637 |
| }, |
| { |
| "epoch": 5.452991452991453, |
| "grad_norm": 18.84516143798828, |
| "learning_rate": 7.273504273504274e-06, |
| "loss": 1.3299, |
| "step": 638 |
| }, |
| { |
| "epoch": 5.461538461538462, |
| "grad_norm": 14.620058059692383, |
| "learning_rate": 7.26923076923077e-06, |
| "loss": 1.0604, |
| "step": 639 |
| }, |
| { |
| "epoch": 5.47008547008547, |
| "grad_norm": 16.5911865234375, |
| "learning_rate": 7.264957264957266e-06, |
| "loss": 1.1138, |
| "step": 640 |
| }, |
| { |
| "epoch": 5.478632478632479, |
| "grad_norm": 15.44485092163086, |
| "learning_rate": 7.260683760683761e-06, |
| "loss": 1.435, |
| "step": 641 |
| }, |
| { |
| "epoch": 5.487179487179487, |
| "grad_norm": 121.76724243164062, |
| "learning_rate": 7.256410256410257e-06, |
| "loss": 1.7167, |
| "step": 642 |
| }, |
| { |
| "epoch": 5.495726495726496, |
| "grad_norm": 1996.141357421875, |
| "learning_rate": 7.2521367521367525e-06, |
| "loss": 4.0296, |
| "step": 643 |
| }, |
| { |
| "epoch": 5.504273504273504, |
| "grad_norm": 15.072067260742188, |
| "learning_rate": 7.247863247863248e-06, |
| "loss": 1.0455, |
| "step": 644 |
| }, |
| { |
| "epoch": 5.512820512820513, |
| "grad_norm": 16.684345245361328, |
| "learning_rate": 7.243589743589744e-06, |
| "loss": 1.7565, |
| "step": 645 |
| }, |
| { |
| "epoch": 5.521367521367521, |
| "grad_norm": 15.515148162841797, |
| "learning_rate": 7.239316239316239e-06, |
| "loss": 1.4601, |
| "step": 646 |
| }, |
| { |
| "epoch": 5.52991452991453, |
| "grad_norm": 20.1015625, |
| "learning_rate": 7.235042735042735e-06, |
| "loss": 1.073, |
| "step": 647 |
| }, |
| { |
| "epoch": 5.538461538461538, |
| "grad_norm": 67.10873413085938, |
| "learning_rate": 7.230769230769231e-06, |
| "loss": 1.8586, |
| "step": 648 |
| }, |
| { |
| "epoch": 5.547008547008547, |
| "grad_norm": 13.775193214416504, |
| "learning_rate": 7.226495726495727e-06, |
| "loss": 1.2891, |
| "step": 649 |
| }, |
| { |
| "epoch": 5.555555555555555, |
| "grad_norm": 14.612048149108887, |
| "learning_rate": 7.222222222222223e-06, |
| "loss": 1.033, |
| "step": 650 |
| }, |
| { |
| "epoch": 5.564102564102564, |
| "grad_norm": 14.512042999267578, |
| "learning_rate": 7.217948717948719e-06, |
| "loss": 1.1446, |
| "step": 651 |
| }, |
| { |
| "epoch": 5.572649572649572, |
| "grad_norm": 13.720820426940918, |
| "learning_rate": 7.213675213675214e-06, |
| "loss": 1.1246, |
| "step": 652 |
| }, |
| { |
| "epoch": 5.581196581196581, |
| "grad_norm": 16.548046112060547, |
| "learning_rate": 7.20940170940171e-06, |
| "loss": 1.3162, |
| "step": 653 |
| }, |
| { |
| "epoch": 5.589743589743589, |
| "grad_norm": 20.535181045532227, |
| "learning_rate": 7.205128205128206e-06, |
| "loss": 1.3019, |
| "step": 654 |
| }, |
| { |
| "epoch": 5.598290598290598, |
| "grad_norm": 14.317465782165527, |
| "learning_rate": 7.2008547008547015e-06, |
| "loss": 1.5447, |
| "step": 655 |
| }, |
| { |
| "epoch": 5.6068376068376065, |
| "grad_norm": 16.23088836669922, |
| "learning_rate": 7.196581196581197e-06, |
| "loss": 1.2701, |
| "step": 656 |
| }, |
| { |
| "epoch": 5.615384615384615, |
| "grad_norm": 13.754173278808594, |
| "learning_rate": 7.192307692307693e-06, |
| "loss": 1.2218, |
| "step": 657 |
| }, |
| { |
| "epoch": 5.6239316239316235, |
| "grad_norm": 75.77688598632812, |
| "learning_rate": 7.188034188034188e-06, |
| "loss": 1.7547, |
| "step": 658 |
| }, |
| { |
| "epoch": 5.632478632478632, |
| "grad_norm": 19.452077865600586, |
| "learning_rate": 7.183760683760684e-06, |
| "loss": 1.1446, |
| "step": 659 |
| }, |
| { |
| "epoch": 5.641025641025641, |
| "grad_norm": 14.513677597045898, |
| "learning_rate": 7.17948717948718e-06, |
| "loss": 1.0527, |
| "step": 660 |
| }, |
| { |
| "epoch": 5.64957264957265, |
| "grad_norm": 27.67446517944336, |
| "learning_rate": 7.175213675213676e-06, |
| "loss": 1.1953, |
| "step": 661 |
| }, |
| { |
| "epoch": 5.6581196581196584, |
| "grad_norm": 12.137639999389648, |
| "learning_rate": 7.170940170940172e-06, |
| "loss": 1.1127, |
| "step": 662 |
| }, |
| { |
| "epoch": 5.666666666666667, |
| "grad_norm": 17.2878475189209, |
| "learning_rate": 7.166666666666667e-06, |
| "loss": 1.0475, |
| "step": 663 |
| }, |
| { |
| "epoch": 5.6752136752136755, |
| "grad_norm": 28.070842742919922, |
| "learning_rate": 7.162393162393163e-06, |
| "loss": 1.6271, |
| "step": 664 |
| }, |
| { |
| "epoch": 5.683760683760684, |
| "grad_norm": 17.74942398071289, |
| "learning_rate": 7.158119658119659e-06, |
| "loss": 1.1759, |
| "step": 665 |
| }, |
| { |
| "epoch": 5.6923076923076925, |
| "grad_norm": 19.545486450195312, |
| "learning_rate": 7.153846153846155e-06, |
| "loss": 0.9753, |
| "step": 666 |
| }, |
| { |
| "epoch": 5.700854700854701, |
| "grad_norm": 24.34153938293457, |
| "learning_rate": 7.1495726495726505e-06, |
| "loss": 1.0905, |
| "step": 667 |
| }, |
| { |
| "epoch": 5.7094017094017095, |
| "grad_norm": 211.7845001220703, |
| "learning_rate": 7.145299145299146e-06, |
| "loss": 1.6455, |
| "step": 668 |
| }, |
| { |
| "epoch": 5.717948717948718, |
| "grad_norm": 14.03074836730957, |
| "learning_rate": 7.1410256410256414e-06, |
| "loss": 1.3728, |
| "step": 669 |
| }, |
| { |
| "epoch": 5.726495726495727, |
| "grad_norm": 27.600345611572266, |
| "learning_rate": 7.136752136752137e-06, |
| "loss": 1.4212, |
| "step": 670 |
| }, |
| { |
| "epoch": 5.735042735042735, |
| "grad_norm": 15.755846977233887, |
| "learning_rate": 7.132478632478633e-06, |
| "loss": 1.148, |
| "step": 671 |
| }, |
| { |
| "epoch": 5.743589743589744, |
| "grad_norm": 12.816133499145508, |
| "learning_rate": 7.128205128205129e-06, |
| "loss": 1.0053, |
| "step": 672 |
| }, |
| { |
| "epoch": 5.752136752136752, |
| "grad_norm": 25.097660064697266, |
| "learning_rate": 7.123931623931625e-06, |
| "loss": 1.1561, |
| "step": 673 |
| }, |
| { |
| "epoch": 5.760683760683761, |
| "grad_norm": 19.249279022216797, |
| "learning_rate": 7.119658119658121e-06, |
| "loss": 1.2582, |
| "step": 674 |
| }, |
| { |
| "epoch": 5.769230769230769, |
| "grad_norm": 18.606924057006836, |
| "learning_rate": 7.115384615384616e-06, |
| "loss": 0.8569, |
| "step": 675 |
| }, |
| { |
| "epoch": 5.777777777777778, |
| "grad_norm": 20.2148380279541, |
| "learning_rate": 7.111111111111112e-06, |
| "loss": 1.1126, |
| "step": 676 |
| }, |
| { |
| "epoch": 5.786324786324786, |
| "grad_norm": 18.623268127441406, |
| "learning_rate": 7.106837606837608e-06, |
| "loss": 1.6129, |
| "step": 677 |
| }, |
| { |
| "epoch": 5.794871794871795, |
| "grad_norm": 14.888258934020996, |
| "learning_rate": 7.102564102564104e-06, |
| "loss": 1.2533, |
| "step": 678 |
| }, |
| { |
| "epoch": 5.803418803418803, |
| "grad_norm": 15.351551055908203, |
| "learning_rate": 7.0982905982905995e-06, |
| "loss": 1.2392, |
| "step": 679 |
| }, |
| { |
| "epoch": 5.811965811965812, |
| "grad_norm": 23.243993759155273, |
| "learning_rate": 7.0940170940170945e-06, |
| "loss": 1.3136, |
| "step": 680 |
| }, |
| { |
| "epoch": 5.82051282051282, |
| "grad_norm": 18.346277236938477, |
| "learning_rate": 7.0897435897435904e-06, |
| "loss": 1.5691, |
| "step": 681 |
| }, |
| { |
| "epoch": 5.829059829059829, |
| "grad_norm": 12.904829025268555, |
| "learning_rate": 7.085470085470086e-06, |
| "loss": 0.9248, |
| "step": 682 |
| }, |
| { |
| "epoch": 5.837606837606837, |
| "grad_norm": 13.263056755065918, |
| "learning_rate": 7.081196581196582e-06, |
| "loss": 1.0555, |
| "step": 683 |
| }, |
| { |
| "epoch": 5.846153846153846, |
| "grad_norm": 19.311899185180664, |
| "learning_rate": 7.076923076923078e-06, |
| "loss": 1.4341, |
| "step": 684 |
| }, |
| { |
| "epoch": 5.854700854700854, |
| "grad_norm": 282.1452331542969, |
| "learning_rate": 7.072649572649574e-06, |
| "loss": 1.9797, |
| "step": 685 |
| }, |
| { |
| "epoch": 5.863247863247864, |
| "grad_norm": 14.317438125610352, |
| "learning_rate": 7.068376068376069e-06, |
| "loss": 0.839, |
| "step": 686 |
| }, |
| { |
| "epoch": 5.871794871794872, |
| "grad_norm": 13.549150466918945, |
| "learning_rate": 7.064102564102565e-06, |
| "loss": 1.1003, |
| "step": 687 |
| }, |
| { |
| "epoch": 5.880341880341881, |
| "grad_norm": 14.283610343933105, |
| "learning_rate": 7.059829059829061e-06, |
| "loss": 1.0297, |
| "step": 688 |
| }, |
| { |
| "epoch": 5.888888888888889, |
| "grad_norm": 18.737884521484375, |
| "learning_rate": 7.055555555555557e-06, |
| "loss": 0.9817, |
| "step": 689 |
| }, |
| { |
| "epoch": 5.897435897435898, |
| "grad_norm": 24.12625503540039, |
| "learning_rate": 7.051282051282053e-06, |
| "loss": 1.1837, |
| "step": 690 |
| }, |
| { |
| "epoch": 5.905982905982906, |
| "grad_norm": 11.760732650756836, |
| "learning_rate": 7.0470085470085485e-06, |
| "loss": 1.5131, |
| "step": 691 |
| }, |
| { |
| "epoch": 5.914529914529915, |
| "grad_norm": 16.138668060302734, |
| "learning_rate": 7.0427350427350435e-06, |
| "loss": 0.9569, |
| "step": 692 |
| }, |
| { |
| "epoch": 5.923076923076923, |
| "grad_norm": 17.727285385131836, |
| "learning_rate": 7.038461538461539e-06, |
| "loss": 0.9834, |
| "step": 693 |
| }, |
| { |
| "epoch": 5.931623931623932, |
| "grad_norm": 13.434252738952637, |
| "learning_rate": 7.034188034188035e-06, |
| "loss": 1.3635, |
| "step": 694 |
| }, |
| { |
| "epoch": 5.94017094017094, |
| "grad_norm": 15.587186813354492, |
| "learning_rate": 7.02991452991453e-06, |
| "loss": 1.4814, |
| "step": 695 |
| }, |
| { |
| "epoch": 5.948717948717949, |
| "grad_norm": 31.379039764404297, |
| "learning_rate": 7.025641025641025e-06, |
| "loss": 0.8792, |
| "step": 696 |
| }, |
| { |
| "epoch": 5.957264957264957, |
| "grad_norm": 14.575559616088867, |
| "learning_rate": 7.021367521367521e-06, |
| "loss": 0.8865, |
| "step": 697 |
| }, |
| { |
| "epoch": 5.965811965811966, |
| "grad_norm": 13.55718994140625, |
| "learning_rate": 7.017094017094017e-06, |
| "loss": 0.9564, |
| "step": 698 |
| }, |
| { |
| "epoch": 5.9743589743589745, |
| "grad_norm": 13.288110733032227, |
| "learning_rate": 7.012820512820513e-06, |
| "loss": 0.8117, |
| "step": 699 |
| }, |
| { |
| "epoch": 5.982905982905983, |
| "grad_norm": 14.522254943847656, |
| "learning_rate": 7.008547008547009e-06, |
| "loss": 1.2037, |
| "step": 700 |
| }, |
| { |
| "epoch": 5.9914529914529915, |
| "grad_norm": 14.575456619262695, |
| "learning_rate": 7.004273504273504e-06, |
| "loss": 1.028, |
| "step": 701 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 13.18249225616455, |
| "learning_rate": 7e-06, |
| "loss": 0.6528, |
| "step": 702 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.4769609868526459, |
| "eval_runtime": 9.253, |
| "eval_samples_per_second": 50.362, |
| "eval_steps_per_second": 6.376, |
| "step": 702 |
| }, |
| { |
| "epoch": 6.0085470085470085, |
| "grad_norm": 17.034433364868164, |
| "learning_rate": 6.995726495726496e-06, |
| "loss": 0.847, |
| "step": 703 |
| }, |
| { |
| "epoch": 6.017094017094017, |
| "grad_norm": 13.455194473266602, |
| "learning_rate": 6.991452991452992e-06, |
| "loss": 0.8545, |
| "step": 704 |
| }, |
| { |
| "epoch": 6.0256410256410255, |
| "grad_norm": 14.511704444885254, |
| "learning_rate": 6.9871794871794876e-06, |
| "loss": 0.9365, |
| "step": 705 |
| }, |
| { |
| "epoch": 6.034188034188034, |
| "grad_norm": 14.325255393981934, |
| "learning_rate": 6.9829059829059835e-06, |
| "loss": 0.869, |
| "step": 706 |
| }, |
| { |
| "epoch": 6.042735042735043, |
| "grad_norm": 12.944524765014648, |
| "learning_rate": 6.9786324786324785e-06, |
| "loss": 1.1417, |
| "step": 707 |
| }, |
| { |
| "epoch": 6.051282051282051, |
| "grad_norm": 14.992669105529785, |
| "learning_rate": 6.974358974358974e-06, |
| "loss": 1.4935, |
| "step": 708 |
| }, |
| { |
| "epoch": 6.05982905982906, |
| "grad_norm": 15.394392013549805, |
| "learning_rate": 6.97008547008547e-06, |
| "loss": 1.519, |
| "step": 709 |
| }, |
| { |
| "epoch": 6.068376068376068, |
| "grad_norm": 12.605085372924805, |
| "learning_rate": 6.965811965811966e-06, |
| "loss": 1.4419, |
| "step": 710 |
| }, |
| { |
| "epoch": 6.076923076923077, |
| "grad_norm": 16.47636604309082, |
| "learning_rate": 6.961538461538462e-06, |
| "loss": 0.9552, |
| "step": 711 |
| }, |
| { |
| "epoch": 6.085470085470085, |
| "grad_norm": 17.04586410522461, |
| "learning_rate": 6.957264957264958e-06, |
| "loss": 0.9847, |
| "step": 712 |
| }, |
| { |
| "epoch": 6.094017094017094, |
| "grad_norm": 15.464738845825195, |
| "learning_rate": 6.952991452991453e-06, |
| "loss": 0.9272, |
| "step": 713 |
| }, |
| { |
| "epoch": 6.102564102564102, |
| "grad_norm": 11.837206840515137, |
| "learning_rate": 6.948717948717949e-06, |
| "loss": 1.1682, |
| "step": 714 |
| }, |
| { |
| "epoch": 6.111111111111111, |
| "grad_norm": 11.013447761535645, |
| "learning_rate": 6.944444444444445e-06, |
| "loss": 1.222, |
| "step": 715 |
| }, |
| { |
| "epoch": 6.119658119658119, |
| "grad_norm": 15.37415885925293, |
| "learning_rate": 6.940170940170941e-06, |
| "loss": 0.9668, |
| "step": 716 |
| }, |
| { |
| "epoch": 6.128205128205128, |
| "grad_norm": 14.077155113220215, |
| "learning_rate": 6.9358974358974366e-06, |
| "loss": 0.8448, |
| "step": 717 |
| }, |
| { |
| "epoch": 6.136752136752137, |
| "grad_norm": 13.440519332885742, |
| "learning_rate": 6.931623931623932e-06, |
| "loss": 0.891, |
| "step": 718 |
| }, |
| { |
| "epoch": 6.145299145299146, |
| "grad_norm": 13.059304237365723, |
| "learning_rate": 6.9273504273504275e-06, |
| "loss": 0.655, |
| "step": 719 |
| }, |
| { |
| "epoch": 6.153846153846154, |
| "grad_norm": 12.96674633026123, |
| "learning_rate": 6.923076923076923e-06, |
| "loss": 0.7755, |
| "step": 720 |
| }, |
| { |
| "epoch": 6.162393162393163, |
| "grad_norm": 10.921567916870117, |
| "learning_rate": 6.918803418803419e-06, |
| "loss": 0.8533, |
| "step": 721 |
| }, |
| { |
| "epoch": 6.170940170940171, |
| "grad_norm": 10.439260482788086, |
| "learning_rate": 6.914529914529915e-06, |
| "loss": 0.8294, |
| "step": 722 |
| }, |
| { |
| "epoch": 6.17948717948718, |
| "grad_norm": 14.948200225830078, |
| "learning_rate": 6.910256410256411e-06, |
| "loss": 0.7326, |
| "step": 723 |
| }, |
| { |
| "epoch": 6.188034188034188, |
| "grad_norm": 12.733176231384277, |
| "learning_rate": 6.905982905982906e-06, |
| "loss": 1.0244, |
| "step": 724 |
| }, |
| { |
| "epoch": 6.196581196581197, |
| "grad_norm": 12.432938575744629, |
| "learning_rate": 6.901709401709402e-06, |
| "loss": 0.7375, |
| "step": 725 |
| }, |
| { |
| "epoch": 6.205128205128205, |
| "grad_norm": 12.047768592834473, |
| "learning_rate": 6.897435897435898e-06, |
| "loss": 0.8348, |
| "step": 726 |
| }, |
| { |
| "epoch": 6.213675213675214, |
| "grad_norm": 19.029287338256836, |
| "learning_rate": 6.893162393162394e-06, |
| "loss": 0.6091, |
| "step": 727 |
| }, |
| { |
| "epoch": 6.222222222222222, |
| "grad_norm": 11.650983810424805, |
| "learning_rate": 6.88888888888889e-06, |
| "loss": 0.9925, |
| "step": 728 |
| }, |
| { |
| "epoch": 6.230769230769231, |
| "grad_norm": 12.12030029296875, |
| "learning_rate": 6.8846153846153855e-06, |
| "loss": 1.0205, |
| "step": 729 |
| }, |
| { |
| "epoch": 6.239316239316239, |
| "grad_norm": 10.283143997192383, |
| "learning_rate": 6.880341880341881e-06, |
| "loss": 0.7726, |
| "step": 730 |
| }, |
| { |
| "epoch": 6.247863247863248, |
| "grad_norm": 12.965302467346191, |
| "learning_rate": 6.8760683760683765e-06, |
| "loss": 1.1761, |
| "step": 731 |
| }, |
| { |
| "epoch": 6.256410256410256, |
| "grad_norm": 9.0562105178833, |
| "learning_rate": 6.871794871794872e-06, |
| "loss": 0.9769, |
| "step": 732 |
| }, |
| { |
| "epoch": 6.264957264957265, |
| "grad_norm": 13.647340774536133, |
| "learning_rate": 6.867521367521368e-06, |
| "loss": 0.7613, |
| "step": 733 |
| }, |
| { |
| "epoch": 6.273504273504273, |
| "grad_norm": 11.598361015319824, |
| "learning_rate": 6.863247863247864e-06, |
| "loss": 0.6236, |
| "step": 734 |
| }, |
| { |
| "epoch": 6.282051282051282, |
| "grad_norm": 10.453935623168945, |
| "learning_rate": 6.858974358974359e-06, |
| "loss": 0.9752, |
| "step": 735 |
| }, |
| { |
| "epoch": 6.2905982905982905, |
| "grad_norm": 14.108942985534668, |
| "learning_rate": 6.854700854700855e-06, |
| "loss": 0.9212, |
| "step": 736 |
| }, |
| { |
| "epoch": 6.299145299145299, |
| "grad_norm": 21.230859756469727, |
| "learning_rate": 6.850427350427351e-06, |
| "loss": 0.9213, |
| "step": 737 |
| }, |
| { |
| "epoch": 6.3076923076923075, |
| "grad_norm": 11.801465034484863, |
| "learning_rate": 6.846153846153847e-06, |
| "loss": 0.8182, |
| "step": 738 |
| }, |
| { |
| "epoch": 6.316239316239316, |
| "grad_norm": 18.9310302734375, |
| "learning_rate": 6.841880341880343e-06, |
| "loss": 0.6214, |
| "step": 739 |
| }, |
| { |
| "epoch": 6.3247863247863245, |
| "grad_norm": 11.773117065429688, |
| "learning_rate": 6.837606837606839e-06, |
| "loss": 0.6221, |
| "step": 740 |
| }, |
| { |
| "epoch": 6.333333333333333, |
| "grad_norm": 187.00250244140625, |
| "learning_rate": 6.833333333333334e-06, |
| "loss": 1.5211, |
| "step": 741 |
| }, |
| { |
| "epoch": 6.3418803418803416, |
| "grad_norm": 70.96250915527344, |
| "learning_rate": 6.82905982905983e-06, |
| "loss": 1.3472, |
| "step": 742 |
| }, |
| { |
| "epoch": 6.35042735042735, |
| "grad_norm": 11.787941932678223, |
| "learning_rate": 6.8247863247863255e-06, |
| "loss": 0.8831, |
| "step": 743 |
| }, |
| { |
| "epoch": 6.358974358974359, |
| "grad_norm": 11.33661937713623, |
| "learning_rate": 6.820512820512821e-06, |
| "loss": 1.0555, |
| "step": 744 |
| }, |
| { |
| "epoch": 6.367521367521368, |
| "grad_norm": 14.255888938903809, |
| "learning_rate": 6.816239316239317e-06, |
| "loss": 0.8246, |
| "step": 745 |
| }, |
| { |
| "epoch": 6.3760683760683765, |
| "grad_norm": 10.89616870880127, |
| "learning_rate": 6.811965811965813e-06, |
| "loss": 1.0179, |
| "step": 746 |
| }, |
| { |
| "epoch": 6.384615384615385, |
| "grad_norm": 9.160380363464355, |
| "learning_rate": 6.807692307692308e-06, |
| "loss": 0.9019, |
| "step": 747 |
| }, |
| { |
| "epoch": 6.3931623931623935, |
| "grad_norm": 12.984644889831543, |
| "learning_rate": 6.803418803418804e-06, |
| "loss": 0.649, |
| "step": 748 |
| }, |
| { |
| "epoch": 6.401709401709402, |
| "grad_norm": 14.073376655578613, |
| "learning_rate": 6.7991452991453e-06, |
| "loss": 0.608, |
| "step": 749 |
| }, |
| { |
| "epoch": 6.410256410256411, |
| "grad_norm": 10.354485511779785, |
| "learning_rate": 6.794871794871796e-06, |
| "loss": 0.8812, |
| "step": 750 |
| }, |
| { |
| "epoch": 6.418803418803419, |
| "grad_norm": 9.121294975280762, |
| "learning_rate": 6.790598290598292e-06, |
| "loss": 0.768, |
| "step": 751 |
| }, |
| { |
| "epoch": 6.427350427350428, |
| "grad_norm": 10.909361839294434, |
| "learning_rate": 6.786324786324787e-06, |
| "loss": 0.8697, |
| "step": 752 |
| }, |
| { |
| "epoch": 6.435897435897436, |
| "grad_norm": 26.324186325073242, |
| "learning_rate": 6.782051282051283e-06, |
| "loss": 1.2437, |
| "step": 753 |
| }, |
| { |
| "epoch": 6.444444444444445, |
| "grad_norm": 11.972411155700684, |
| "learning_rate": 6.777777777777779e-06, |
| "loss": 0.6366, |
| "step": 754 |
| }, |
| { |
| "epoch": 6.452991452991453, |
| "grad_norm": 25.042150497436523, |
| "learning_rate": 6.7735042735042745e-06, |
| "loss": 1.0371, |
| "step": 755 |
| }, |
| { |
| "epoch": 6.461538461538462, |
| "grad_norm": 10.331900596618652, |
| "learning_rate": 6.76923076923077e-06, |
| "loss": 0.5618, |
| "step": 756 |
| }, |
| { |
| "epoch": 6.47008547008547, |
| "grad_norm": 11.925344467163086, |
| "learning_rate": 6.764957264957266e-06, |
| "loss": 0.629, |
| "step": 757 |
| }, |
| { |
| "epoch": 6.478632478632479, |
| "grad_norm": 10.309441566467285, |
| "learning_rate": 6.760683760683761e-06, |
| "loss": 0.7158, |
| "step": 758 |
| }, |
| { |
| "epoch": 6.487179487179487, |
| "grad_norm": 11.374105453491211, |
| "learning_rate": 6.756410256410257e-06, |
| "loss": 0.6909, |
| "step": 759 |
| }, |
| { |
| "epoch": 6.495726495726496, |
| "grad_norm": 11.613142967224121, |
| "learning_rate": 6.752136752136753e-06, |
| "loss": 0.6139, |
| "step": 760 |
| }, |
| { |
| "epoch": 6.504273504273504, |
| "grad_norm": 14.499147415161133, |
| "learning_rate": 6.747863247863249e-06, |
| "loss": 0.7242, |
| "step": 761 |
| }, |
| { |
| "epoch": 6.512820512820513, |
| "grad_norm": 13.683001518249512, |
| "learning_rate": 6.743589743589745e-06, |
| "loss": 0.9246, |
| "step": 762 |
| }, |
| { |
| "epoch": 6.521367521367521, |
| "grad_norm": 11.068865776062012, |
| "learning_rate": 6.739316239316241e-06, |
| "loss": 0.8866, |
| "step": 763 |
| }, |
| { |
| "epoch": 6.52991452991453, |
| "grad_norm": 13.0232572555542, |
| "learning_rate": 6.735042735042736e-06, |
| "loss": 0.86, |
| "step": 764 |
| }, |
| { |
| "epoch": 6.538461538461538, |
| "grad_norm": 10.639331817626953, |
| "learning_rate": 6.730769230769232e-06, |
| "loss": 0.6928, |
| "step": 765 |
| }, |
| { |
| "epoch": 6.547008547008547, |
| "grad_norm": 11.792994499206543, |
| "learning_rate": 6.7264957264957276e-06, |
| "loss": 0.6571, |
| "step": 766 |
| }, |
| { |
| "epoch": 6.555555555555555, |
| "grad_norm": 15.907414436340332, |
| "learning_rate": 6.7222222222222235e-06, |
| "loss": 1.1426, |
| "step": 767 |
| }, |
| { |
| "epoch": 6.564102564102564, |
| "grad_norm": 12.207514762878418, |
| "learning_rate": 6.717948717948718e-06, |
| "loss": 1.0932, |
| "step": 768 |
| }, |
| { |
| "epoch": 6.572649572649572, |
| "grad_norm": 20.145288467407227, |
| "learning_rate": 6.7136752136752135e-06, |
| "loss": 0.9706, |
| "step": 769 |
| }, |
| { |
| "epoch": 6.581196581196581, |
| "grad_norm": 9.820805549621582, |
| "learning_rate": 6.7094017094017094e-06, |
| "loss": 0.4955, |
| "step": 770 |
| }, |
| { |
| "epoch": 6.589743589743589, |
| "grad_norm": 10.385655403137207, |
| "learning_rate": 6.705128205128205e-06, |
| "loss": 1.0172, |
| "step": 771 |
| }, |
| { |
| "epoch": 6.598290598290598, |
| "grad_norm": 11.708373069763184, |
| "learning_rate": 6.700854700854701e-06, |
| "loss": 0.8048, |
| "step": 772 |
| }, |
| { |
| "epoch": 6.6068376068376065, |
| "grad_norm": 9.812984466552734, |
| "learning_rate": 6.696581196581196e-06, |
| "loss": 0.4831, |
| "step": 773 |
| }, |
| { |
| "epoch": 6.615384615384615, |
| "grad_norm": 9.146960258483887, |
| "learning_rate": 6.692307692307692e-06, |
| "loss": 0.6178, |
| "step": 774 |
| }, |
| { |
| "epoch": 6.6239316239316235, |
| "grad_norm": 13.61231517791748, |
| "learning_rate": 6.688034188034188e-06, |
| "loss": 0.7812, |
| "step": 775 |
| }, |
| { |
| "epoch": 6.632478632478632, |
| "grad_norm": 10.349262237548828, |
| "learning_rate": 6.683760683760684e-06, |
| "loss": 0.819, |
| "step": 776 |
| }, |
| { |
| "epoch": 6.641025641025641, |
| "grad_norm": 48.387847900390625, |
| "learning_rate": 6.67948717948718e-06, |
| "loss": 1.5294, |
| "step": 777 |
| }, |
| { |
| "epoch": 6.64957264957265, |
| "grad_norm": 9.540630340576172, |
| "learning_rate": 6.675213675213676e-06, |
| "loss": 0.6564, |
| "step": 778 |
| }, |
| { |
| "epoch": 6.6581196581196584, |
| "grad_norm": 10.83983039855957, |
| "learning_rate": 6.670940170940171e-06, |
| "loss": 0.5109, |
| "step": 779 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 15.380743026733398, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 0.6504, |
| "step": 780 |
| }, |
| { |
| "epoch": 6.6752136752136755, |
| "grad_norm": 16.796918869018555, |
| "learning_rate": 6.6623931623931625e-06, |
| "loss": 0.7944, |
| "step": 781 |
| }, |
| { |
| "epoch": 6.683760683760684, |
| "grad_norm": 39.64078140258789, |
| "learning_rate": 6.6581196581196584e-06, |
| "loss": 0.6929, |
| "step": 782 |
| }, |
| { |
| "epoch": 6.6923076923076925, |
| "grad_norm": 7.730568885803223, |
| "learning_rate": 6.653846153846154e-06, |
| "loss": 0.6284, |
| "step": 783 |
| }, |
| { |
| "epoch": 6.700854700854701, |
| "grad_norm": 7.840725898742676, |
| "learning_rate": 6.64957264957265e-06, |
| "loss": 0.5113, |
| "step": 784 |
| }, |
| { |
| "epoch": 6.7094017094017095, |
| "grad_norm": 13.925577163696289, |
| "learning_rate": 6.645299145299145e-06, |
| "loss": 0.6846, |
| "step": 785 |
| }, |
| { |
| "epoch": 6.717948717948718, |
| "grad_norm": 10.926531791687012, |
| "learning_rate": 6.641025641025641e-06, |
| "loss": 1.3245, |
| "step": 786 |
| }, |
| { |
| "epoch": 6.726495726495727, |
| "grad_norm": 10.698541641235352, |
| "learning_rate": 6.636752136752137e-06, |
| "loss": 0.6025, |
| "step": 787 |
| }, |
| { |
| "epoch": 6.735042735042735, |
| "grad_norm": 7.572136878967285, |
| "learning_rate": 6.632478632478633e-06, |
| "loss": 0.5473, |
| "step": 788 |
| }, |
| { |
| "epoch": 6.743589743589744, |
| "grad_norm": 26.242990493774414, |
| "learning_rate": 6.628205128205129e-06, |
| "loss": 0.5637, |
| "step": 789 |
| }, |
| { |
| "epoch": 6.752136752136752, |
| "grad_norm": 8.79776668548584, |
| "learning_rate": 6.623931623931624e-06, |
| "loss": 0.7595, |
| "step": 790 |
| }, |
| { |
| "epoch": 6.760683760683761, |
| "grad_norm": 8.951017379760742, |
| "learning_rate": 6.61965811965812e-06, |
| "loss": 1.0365, |
| "step": 791 |
| }, |
| { |
| "epoch": 6.769230769230769, |
| "grad_norm": 13.799118041992188, |
| "learning_rate": 6.615384615384616e-06, |
| "loss": 1.4206, |
| "step": 792 |
| }, |
| { |
| "epoch": 6.777777777777778, |
| "grad_norm": 674.3671875, |
| "learning_rate": 6.6111111111111115e-06, |
| "loss": 1.1752, |
| "step": 793 |
| }, |
| { |
| "epoch": 6.786324786324786, |
| "grad_norm": 8.110879898071289, |
| "learning_rate": 6.606837606837607e-06, |
| "loss": 0.4668, |
| "step": 794 |
| }, |
| { |
| "epoch": 6.794871794871795, |
| "grad_norm": 8.119854927062988, |
| "learning_rate": 6.602564102564103e-06, |
| "loss": 0.7689, |
| "step": 795 |
| }, |
| { |
| "epoch": 6.803418803418803, |
| "grad_norm": 11.039762496948242, |
| "learning_rate": 6.598290598290598e-06, |
| "loss": 0.5636, |
| "step": 796 |
| }, |
| { |
| "epoch": 6.811965811965812, |
| "grad_norm": 12.724084854125977, |
| "learning_rate": 6.594017094017094e-06, |
| "loss": 0.5072, |
| "step": 797 |
| }, |
| { |
| "epoch": 6.82051282051282, |
| "grad_norm": 12.196049690246582, |
| "learning_rate": 6.58974358974359e-06, |
| "loss": 0.5073, |
| "step": 798 |
| }, |
| { |
| "epoch": 6.829059829059829, |
| "grad_norm": 9.072951316833496, |
| "learning_rate": 6.585470085470086e-06, |
| "loss": 0.4855, |
| "step": 799 |
| }, |
| { |
| "epoch": 6.837606837606837, |
| "grad_norm": 10.53836441040039, |
| "learning_rate": 6.581196581196582e-06, |
| "loss": 1.0017, |
| "step": 800 |
| }, |
| { |
| "epoch": 6.846153846153846, |
| "grad_norm": 7.728690147399902, |
| "learning_rate": 6.576923076923078e-06, |
| "loss": 0.5784, |
| "step": 801 |
| }, |
| { |
| "epoch": 6.854700854700854, |
| "grad_norm": 28.362455368041992, |
| "learning_rate": 6.572649572649573e-06, |
| "loss": 1.0295, |
| "step": 802 |
| }, |
| { |
| "epoch": 6.863247863247864, |
| "grad_norm": 7.291123390197754, |
| "learning_rate": 6.568376068376069e-06, |
| "loss": 0.7836, |
| "step": 803 |
| }, |
| { |
| "epoch": 6.871794871794872, |
| "grad_norm": 9.566614151000977, |
| "learning_rate": 6.564102564102565e-06, |
| "loss": 0.9979, |
| "step": 804 |
| }, |
| { |
| "epoch": 6.880341880341881, |
| "grad_norm": 13.544408798217773, |
| "learning_rate": 6.5598290598290605e-06, |
| "loss": 0.5354, |
| "step": 805 |
| }, |
| { |
| "epoch": 6.888888888888889, |
| "grad_norm": 8.546881675720215, |
| "learning_rate": 6.555555555555556e-06, |
| "loss": 0.4689, |
| "step": 806 |
| }, |
| { |
| "epoch": 6.897435897435898, |
| "grad_norm": 8.94822883605957, |
| "learning_rate": 6.5512820512820515e-06, |
| "loss": 0.4432, |
| "step": 807 |
| }, |
| { |
| "epoch": 6.905982905982906, |
| "grad_norm": 6.5176544189453125, |
| "learning_rate": 6.547008547008547e-06, |
| "loss": 0.6747, |
| "step": 808 |
| }, |
| { |
| "epoch": 6.914529914529915, |
| "grad_norm": 9.48947811126709, |
| "learning_rate": 6.542735042735043e-06, |
| "loss": 0.4268, |
| "step": 809 |
| }, |
| { |
| "epoch": 6.923076923076923, |
| "grad_norm": 11.432586669921875, |
| "learning_rate": 6.538461538461539e-06, |
| "loss": 0.5486, |
| "step": 810 |
| }, |
| { |
| "epoch": 6.931623931623932, |
| "grad_norm": 7.585604667663574, |
| "learning_rate": 6.534188034188035e-06, |
| "loss": 0.4412, |
| "step": 811 |
| }, |
| { |
| "epoch": 6.94017094017094, |
| "grad_norm": 7.860292911529541, |
| "learning_rate": 6.529914529914531e-06, |
| "loss": 0.6428, |
| "step": 812 |
| }, |
| { |
| "epoch": 6.948717948717949, |
| "grad_norm": 27.83890151977539, |
| "learning_rate": 6.525641025641026e-06, |
| "loss": 0.6735, |
| "step": 813 |
| }, |
| { |
| "epoch": 6.957264957264957, |
| "grad_norm": 10.266451835632324, |
| "learning_rate": 6.521367521367522e-06, |
| "loss": 0.6757, |
| "step": 814 |
| }, |
| { |
| "epoch": 6.965811965811966, |
| "grad_norm": 8.839099884033203, |
| "learning_rate": 6.517094017094018e-06, |
| "loss": 0.7897, |
| "step": 815 |
| }, |
| { |
| "epoch": 6.9743589743589745, |
| "grad_norm": 10.037760734558105, |
| "learning_rate": 6.512820512820514e-06, |
| "loss": 0.7133, |
| "step": 816 |
| }, |
| { |
| "epoch": 6.982905982905983, |
| "grad_norm": 14.50278377532959, |
| "learning_rate": 6.5085470085470095e-06, |
| "loss": 1.0051, |
| "step": 817 |
| }, |
| { |
| "epoch": 6.9914529914529915, |
| "grad_norm": 8.775527000427246, |
| "learning_rate": 6.504273504273505e-06, |
| "loss": 0.8769, |
| "step": 818 |
| }, |
| { |
| "epoch": 7.0, |
| "grad_norm": 8.891378402709961, |
| "learning_rate": 6.5000000000000004e-06, |
| "loss": 0.9586, |
| "step": 819 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.23673956096172333, |
| "eval_runtime": 9.3447, |
| "eval_samples_per_second": 49.868, |
| "eval_steps_per_second": 6.314, |
| "step": 819 |
| }, |
| { |
| "epoch": 7.0085470085470085, |
| "grad_norm": 8.925857543945312, |
| "learning_rate": 6.495726495726496e-06, |
| "loss": 1.0406, |
| "step": 820 |
| }, |
| { |
| "epoch": 7.017094017094017, |
| "grad_norm": 8.222796440124512, |
| "learning_rate": 6.491452991452992e-06, |
| "loss": 0.4911, |
| "step": 821 |
| }, |
| { |
| "epoch": 7.0256410256410255, |
| "grad_norm": 11.528886795043945, |
| "learning_rate": 6.487179487179488e-06, |
| "loss": 0.8292, |
| "step": 822 |
| }, |
| { |
| "epoch": 7.034188034188034, |
| "grad_norm": 7.9031524658203125, |
| "learning_rate": 6.482905982905984e-06, |
| "loss": 0.5319, |
| "step": 823 |
| }, |
| { |
| "epoch": 7.042735042735043, |
| "grad_norm": 6.788857936859131, |
| "learning_rate": 6.478632478632479e-06, |
| "loss": 0.431, |
| "step": 824 |
| }, |
| { |
| "epoch": 7.051282051282051, |
| "grad_norm": 8.84765911102295, |
| "learning_rate": 6.474358974358975e-06, |
| "loss": 0.6417, |
| "step": 825 |
| }, |
| { |
| "epoch": 7.05982905982906, |
| "grad_norm": 7.517561435699463, |
| "learning_rate": 6.470085470085471e-06, |
| "loss": 0.5828, |
| "step": 826 |
| }, |
| { |
| "epoch": 7.068376068376068, |
| "grad_norm": 9.86832332611084, |
| "learning_rate": 6.465811965811967e-06, |
| "loss": 0.5851, |
| "step": 827 |
| }, |
| { |
| "epoch": 7.076923076923077, |
| "grad_norm": 9.632494926452637, |
| "learning_rate": 6.461538461538463e-06, |
| "loss": 0.769, |
| "step": 828 |
| }, |
| { |
| "epoch": 7.085470085470085, |
| "grad_norm": 9.874857902526855, |
| "learning_rate": 6.4572649572649585e-06, |
| "loss": 0.4393, |
| "step": 829 |
| }, |
| { |
| "epoch": 7.094017094017094, |
| "grad_norm": 11.78085994720459, |
| "learning_rate": 6.4529914529914535e-06, |
| "loss": 0.8784, |
| "step": 830 |
| }, |
| { |
| "epoch": 7.102564102564102, |
| "grad_norm": 8.85053825378418, |
| "learning_rate": 6.4487179487179494e-06, |
| "loss": 0.5911, |
| "step": 831 |
| }, |
| { |
| "epoch": 7.111111111111111, |
| "grad_norm": 12.405013084411621, |
| "learning_rate": 6.444444444444445e-06, |
| "loss": 0.4941, |
| "step": 832 |
| }, |
| { |
| "epoch": 7.119658119658119, |
| "grad_norm": 12.237760543823242, |
| "learning_rate": 6.440170940170941e-06, |
| "loss": 0.4468, |
| "step": 833 |
| }, |
| { |
| "epoch": 7.128205128205128, |
| "grad_norm": 7.945899486541748, |
| "learning_rate": 6.435897435897437e-06, |
| "loss": 0.4101, |
| "step": 834 |
| }, |
| { |
| "epoch": 7.136752136752137, |
| "grad_norm": 10.743217468261719, |
| "learning_rate": 6.431623931623933e-06, |
| "loss": 0.679, |
| "step": 835 |
| }, |
| { |
| "epoch": 7.145299145299146, |
| "grad_norm": 7.700406551361084, |
| "learning_rate": 6.427350427350428e-06, |
| "loss": 0.5067, |
| "step": 836 |
| }, |
| { |
| "epoch": 7.153846153846154, |
| "grad_norm": 8.401918411254883, |
| "learning_rate": 6.423076923076924e-06, |
| "loss": 0.5893, |
| "step": 837 |
| }, |
| { |
| "epoch": 7.162393162393163, |
| "grad_norm": 23.065881729125977, |
| "learning_rate": 6.41880341880342e-06, |
| "loss": 0.6768, |
| "step": 838 |
| }, |
| { |
| "epoch": 7.170940170940171, |
| "grad_norm": 38.71855545043945, |
| "learning_rate": 6.414529914529916e-06, |
| "loss": 0.8828, |
| "step": 839 |
| }, |
| { |
| "epoch": 7.17948717948718, |
| "grad_norm": 12.142110824584961, |
| "learning_rate": 6.410256410256412e-06, |
| "loss": 0.5444, |
| "step": 840 |
| }, |
| { |
| "epoch": 7.188034188034188, |
| "grad_norm": 69.4731674194336, |
| "learning_rate": 6.405982905982906e-06, |
| "loss": 0.7768, |
| "step": 841 |
| }, |
| { |
| "epoch": 7.196581196581197, |
| "grad_norm": 15.926841735839844, |
| "learning_rate": 6.401709401709402e-06, |
| "loss": 0.4348, |
| "step": 842 |
| }, |
| { |
| "epoch": 7.205128205128205, |
| "grad_norm": 6.8418965339660645, |
| "learning_rate": 6.397435897435898e-06, |
| "loss": 0.3821, |
| "step": 843 |
| }, |
| { |
| "epoch": 7.213675213675214, |
| "grad_norm": 6.716574192047119, |
| "learning_rate": 6.3931623931623935e-06, |
| "loss": 0.3621, |
| "step": 844 |
| }, |
| { |
| "epoch": 7.222222222222222, |
| "grad_norm": 7.452919006347656, |
| "learning_rate": 6.3888888888888885e-06, |
| "loss": 0.4997, |
| "step": 845 |
| }, |
| { |
| "epoch": 7.230769230769231, |
| "grad_norm": 11.502019882202148, |
| "learning_rate": 6.384615384615384e-06, |
| "loss": 0.8017, |
| "step": 846 |
| }, |
| { |
| "epoch": 7.239316239316239, |
| "grad_norm": 7.349746227264404, |
| "learning_rate": 6.38034188034188e-06, |
| "loss": 0.2745, |
| "step": 847 |
| }, |
| { |
| "epoch": 7.247863247863248, |
| "grad_norm": 6.269787311553955, |
| "learning_rate": 6.376068376068376e-06, |
| "loss": 0.4131, |
| "step": 848 |
| }, |
| { |
| "epoch": 7.256410256410256, |
| "grad_norm": 9.56203842163086, |
| "learning_rate": 6.371794871794872e-06, |
| "loss": 0.8147, |
| "step": 849 |
| }, |
| { |
| "epoch": 7.264957264957265, |
| "grad_norm": 7.358108043670654, |
| "learning_rate": 6.367521367521368e-06, |
| "loss": 0.3552, |
| "step": 850 |
| }, |
| { |
| "epoch": 7.273504273504273, |
| "grad_norm": 7.6359782218933105, |
| "learning_rate": 6.363247863247863e-06, |
| "loss": 0.3302, |
| "step": 851 |
| }, |
| { |
| "epoch": 7.282051282051282, |
| "grad_norm": 7.356925010681152, |
| "learning_rate": 6.358974358974359e-06, |
| "loss": 0.2927, |
| "step": 852 |
| }, |
| { |
| "epoch": 7.2905982905982905, |
| "grad_norm": 11.097757339477539, |
| "learning_rate": 6.354700854700855e-06, |
| "loss": 0.8117, |
| "step": 853 |
| }, |
| { |
| "epoch": 7.299145299145299, |
| "grad_norm": 10.301170349121094, |
| "learning_rate": 6.350427350427351e-06, |
| "loss": 0.4044, |
| "step": 854 |
| }, |
| { |
| "epoch": 7.3076923076923075, |
| "grad_norm": 7.116042613983154, |
| "learning_rate": 6.3461538461538466e-06, |
| "loss": 0.289, |
| "step": 855 |
| }, |
| { |
| "epoch": 7.316239316239316, |
| "grad_norm": 7.453964710235596, |
| "learning_rate": 6.3418803418803425e-06, |
| "loss": 0.4652, |
| "step": 856 |
| }, |
| { |
| "epoch": 7.3247863247863245, |
| "grad_norm": 11.864774703979492, |
| "learning_rate": 6.3376068376068375e-06, |
| "loss": 0.4667, |
| "step": 857 |
| }, |
| { |
| "epoch": 7.333333333333333, |
| "grad_norm": 8.79547119140625, |
| "learning_rate": 6.333333333333333e-06, |
| "loss": 0.2874, |
| "step": 858 |
| }, |
| { |
| "epoch": 7.3418803418803416, |
| "grad_norm": 10.173043251037598, |
| "learning_rate": 6.329059829059829e-06, |
| "loss": 0.6844, |
| "step": 859 |
| }, |
| { |
| "epoch": 7.35042735042735, |
| "grad_norm": 9.26555061340332, |
| "learning_rate": 6.324786324786325e-06, |
| "loss": 0.2903, |
| "step": 860 |
| }, |
| { |
| "epoch": 7.358974358974359, |
| "grad_norm": 10.274518013000488, |
| "learning_rate": 6.320512820512821e-06, |
| "loss": 0.7824, |
| "step": 861 |
| }, |
| { |
| "epoch": 7.367521367521368, |
| "grad_norm": 7.104451656341553, |
| "learning_rate": 6.316239316239316e-06, |
| "loss": 0.3024, |
| "step": 862 |
| }, |
| { |
| "epoch": 7.3760683760683765, |
| "grad_norm": 9.522738456726074, |
| "learning_rate": 6.311965811965812e-06, |
| "loss": 0.3219, |
| "step": 863 |
| }, |
| { |
| "epoch": 7.384615384615385, |
| "grad_norm": 10.145588874816895, |
| "learning_rate": 6.307692307692308e-06, |
| "loss": 0.5319, |
| "step": 864 |
| }, |
| { |
| "epoch": 7.3931623931623935, |
| "grad_norm": 8.828988075256348, |
| "learning_rate": 6.303418803418804e-06, |
| "loss": 0.3286, |
| "step": 865 |
| }, |
| { |
| "epoch": 7.401709401709402, |
| "grad_norm": 7.314462661743164, |
| "learning_rate": 6.2991452991453e-06, |
| "loss": 0.2951, |
| "step": 866 |
| }, |
| { |
| "epoch": 7.410256410256411, |
| "grad_norm": 13.465666770935059, |
| "learning_rate": 6.2948717948717956e-06, |
| "loss": 0.4046, |
| "step": 867 |
| }, |
| { |
| "epoch": 7.418803418803419, |
| "grad_norm": 12.40607738494873, |
| "learning_rate": 6.290598290598291e-06, |
| "loss": 0.71, |
| "step": 868 |
| }, |
| { |
| "epoch": 7.427350427350428, |
| "grad_norm": 9.282904624938965, |
| "learning_rate": 6.2863247863247865e-06, |
| "loss": 0.4083, |
| "step": 869 |
| }, |
| { |
| "epoch": 7.435897435897436, |
| "grad_norm": 5.755247116088867, |
| "learning_rate": 6.282051282051282e-06, |
| "loss": 0.3858, |
| "step": 870 |
| }, |
| { |
| "epoch": 7.444444444444445, |
| "grad_norm": 6.996497631072998, |
| "learning_rate": 6.277777777777778e-06, |
| "loss": 0.2692, |
| "step": 871 |
| }, |
| { |
| "epoch": 7.452991452991453, |
| "grad_norm": 7.235395431518555, |
| "learning_rate": 6.273504273504274e-06, |
| "loss": 0.3936, |
| "step": 872 |
| }, |
| { |
| "epoch": 7.461538461538462, |
| "grad_norm": 14.275704383850098, |
| "learning_rate": 6.26923076923077e-06, |
| "loss": 0.4022, |
| "step": 873 |
| }, |
| { |
| "epoch": 7.47008547008547, |
| "grad_norm": 10.365689277648926, |
| "learning_rate": 6.264957264957265e-06, |
| "loss": 1.0508, |
| "step": 874 |
| }, |
| { |
| "epoch": 7.478632478632479, |
| "grad_norm": 5.840590000152588, |
| "learning_rate": 6.260683760683761e-06, |
| "loss": 0.2511, |
| "step": 875 |
| }, |
| { |
| "epoch": 7.487179487179487, |
| "grad_norm": 10.25346851348877, |
| "learning_rate": 6.256410256410257e-06, |
| "loss": 0.5836, |
| "step": 876 |
| }, |
| { |
| "epoch": 7.495726495726496, |
| "grad_norm": 27.662694931030273, |
| "learning_rate": 6.252136752136753e-06, |
| "loss": 0.7677, |
| "step": 877 |
| }, |
| { |
| "epoch": 7.504273504273504, |
| "grad_norm": 5.840217590332031, |
| "learning_rate": 6.247863247863249e-06, |
| "loss": 0.3889, |
| "step": 878 |
| }, |
| { |
| "epoch": 7.512820512820513, |
| "grad_norm": 9.813179016113281, |
| "learning_rate": 6.243589743589744e-06, |
| "loss": 0.8929, |
| "step": 879 |
| }, |
| { |
| "epoch": 7.521367521367521, |
| "grad_norm": 5.49755334854126, |
| "learning_rate": 6.23931623931624e-06, |
| "loss": 0.2712, |
| "step": 880 |
| }, |
| { |
| "epoch": 7.52991452991453, |
| "grad_norm": 7.17311429977417, |
| "learning_rate": 6.2350427350427355e-06, |
| "loss": 0.3071, |
| "step": 881 |
| }, |
| { |
| "epoch": 7.538461538461538, |
| "grad_norm": 7.706870079040527, |
| "learning_rate": 6.230769230769231e-06, |
| "loss": 0.3797, |
| "step": 882 |
| }, |
| { |
| "epoch": 7.547008547008547, |
| "grad_norm": 7.891415596008301, |
| "learning_rate": 6.226495726495727e-06, |
| "loss": 0.5352, |
| "step": 883 |
| }, |
| { |
| "epoch": 7.555555555555555, |
| "grad_norm": 8.746044158935547, |
| "learning_rate": 6.222222222222223e-06, |
| "loss": 0.263, |
| "step": 884 |
| }, |
| { |
| "epoch": 7.564102564102564, |
| "grad_norm": 9.096441268920898, |
| "learning_rate": 6.217948717948718e-06, |
| "loss": 0.2736, |
| "step": 885 |
| }, |
| { |
| "epoch": 7.572649572649572, |
| "grad_norm": 7.031003475189209, |
| "learning_rate": 6.213675213675214e-06, |
| "loss": 0.4705, |
| "step": 886 |
| }, |
| { |
| "epoch": 7.581196581196581, |
| "grad_norm": 6.6503143310546875, |
| "learning_rate": 6.20940170940171e-06, |
| "loss": 0.3285, |
| "step": 887 |
| }, |
| { |
| "epoch": 7.589743589743589, |
| "grad_norm": 5.398913383483887, |
| "learning_rate": 6.205128205128206e-06, |
| "loss": 0.41, |
| "step": 888 |
| }, |
| { |
| "epoch": 7.598290598290598, |
| "grad_norm": 7.47569465637207, |
| "learning_rate": 6.200854700854702e-06, |
| "loss": 0.4005, |
| "step": 889 |
| }, |
| { |
| "epoch": 7.6068376068376065, |
| "grad_norm": 8.79906940460205, |
| "learning_rate": 6.196581196581198e-06, |
| "loss": 0.2608, |
| "step": 890 |
| }, |
| { |
| "epoch": 7.615384615384615, |
| "grad_norm": 7.604002475738525, |
| "learning_rate": 6.192307692307693e-06, |
| "loss": 0.577, |
| "step": 891 |
| }, |
| { |
| "epoch": 7.6239316239316235, |
| "grad_norm": 12.666848182678223, |
| "learning_rate": 6.188034188034189e-06, |
| "loss": 0.7296, |
| "step": 892 |
| }, |
| { |
| "epoch": 7.632478632478632, |
| "grad_norm": 20.92390251159668, |
| "learning_rate": 6.1837606837606845e-06, |
| "loss": 0.9276, |
| "step": 893 |
| }, |
| { |
| "epoch": 7.641025641025641, |
| "grad_norm": 6.779317855834961, |
| "learning_rate": 6.17948717948718e-06, |
| "loss": 0.818, |
| "step": 894 |
| }, |
| { |
| "epoch": 7.64957264957265, |
| "grad_norm": 5.249539852142334, |
| "learning_rate": 6.175213675213676e-06, |
| "loss": 0.2117, |
| "step": 895 |
| }, |
| { |
| "epoch": 7.6581196581196584, |
| "grad_norm": 23.55508041381836, |
| "learning_rate": 6.170940170940171e-06, |
| "loss": 0.5239, |
| "step": 896 |
| }, |
| { |
| "epoch": 7.666666666666667, |
| "grad_norm": 11.711256980895996, |
| "learning_rate": 6.166666666666667e-06, |
| "loss": 0.6595, |
| "step": 897 |
| }, |
| { |
| "epoch": 7.6752136752136755, |
| "grad_norm": 6.641115188598633, |
| "learning_rate": 6.162393162393163e-06, |
| "loss": 0.4888, |
| "step": 898 |
| }, |
| { |
| "epoch": 7.683760683760684, |
| "grad_norm": 7.913390159606934, |
| "learning_rate": 6.158119658119659e-06, |
| "loss": 0.66, |
| "step": 899 |
| }, |
| { |
| "epoch": 7.6923076923076925, |
| "grad_norm": 17.927574157714844, |
| "learning_rate": 6.153846153846155e-06, |
| "loss": 0.9603, |
| "step": 900 |
| }, |
| { |
| "epoch": 7.700854700854701, |
| "grad_norm": 4.567203998565674, |
| "learning_rate": 6.149572649572651e-06, |
| "loss": 0.1638, |
| "step": 901 |
| }, |
| { |
| "epoch": 7.7094017094017095, |
| "grad_norm": 5.995935440063477, |
| "learning_rate": 6.145299145299146e-06, |
| "loss": 0.6852, |
| "step": 902 |
| }, |
| { |
| "epoch": 7.717948717948718, |
| "grad_norm": 8.323802947998047, |
| "learning_rate": 6.141025641025642e-06, |
| "loss": 0.5293, |
| "step": 903 |
| }, |
| { |
| "epoch": 7.726495726495727, |
| "grad_norm": 6.8586859703063965, |
| "learning_rate": 6.136752136752138e-06, |
| "loss": 0.3265, |
| "step": 904 |
| }, |
| { |
| "epoch": 7.735042735042735, |
| "grad_norm": 6.507427215576172, |
| "learning_rate": 6.1324786324786335e-06, |
| "loss": 0.2841, |
| "step": 905 |
| }, |
| { |
| "epoch": 7.743589743589744, |
| "grad_norm": 6.789999485015869, |
| "learning_rate": 6.128205128205129e-06, |
| "loss": 0.4236, |
| "step": 906 |
| }, |
| { |
| "epoch": 7.752136752136752, |
| "grad_norm": 19.444454193115234, |
| "learning_rate": 6.123931623931625e-06, |
| "loss": 0.2829, |
| "step": 907 |
| }, |
| { |
| "epoch": 7.760683760683761, |
| "grad_norm": 31.564800262451172, |
| "learning_rate": 6.11965811965812e-06, |
| "loss": 1.093, |
| "step": 908 |
| }, |
| { |
| "epoch": 7.769230769230769, |
| "grad_norm": 9.956007957458496, |
| "learning_rate": 6.115384615384616e-06, |
| "loss": 0.6749, |
| "step": 909 |
| }, |
| { |
| "epoch": 7.777777777777778, |
| "grad_norm": 5.193087577819824, |
| "learning_rate": 6.111111111111112e-06, |
| "loss": 0.1986, |
| "step": 910 |
| }, |
| { |
| "epoch": 7.786324786324786, |
| "grad_norm": 4.792945384979248, |
| "learning_rate": 6.106837606837608e-06, |
| "loss": 0.5179, |
| "step": 911 |
| }, |
| { |
| "epoch": 7.794871794871795, |
| "grad_norm": 20.602317810058594, |
| "learning_rate": 6.102564102564104e-06, |
| "loss": 1.0343, |
| "step": 912 |
| }, |
| { |
| "epoch": 7.803418803418803, |
| "grad_norm": 22.205543518066406, |
| "learning_rate": 6.098290598290599e-06, |
| "loss": 0.4921, |
| "step": 913 |
| }, |
| { |
| "epoch": 7.811965811965812, |
| "grad_norm": 13.392712593078613, |
| "learning_rate": 6.094017094017095e-06, |
| "loss": 0.9058, |
| "step": 914 |
| }, |
| { |
| "epoch": 7.82051282051282, |
| "grad_norm": 6.262679100036621, |
| "learning_rate": 6.08974358974359e-06, |
| "loss": 0.3877, |
| "step": 915 |
| }, |
| { |
| "epoch": 7.829059829059829, |
| "grad_norm": 12.727428436279297, |
| "learning_rate": 6.085470085470086e-06, |
| "loss": 0.4477, |
| "step": 916 |
| }, |
| { |
| "epoch": 7.837606837606837, |
| "grad_norm": 6.595224380493164, |
| "learning_rate": 6.081196581196581e-06, |
| "loss": 0.5553, |
| "step": 917 |
| }, |
| { |
| "epoch": 7.846153846153846, |
| "grad_norm": 6.815043926239014, |
| "learning_rate": 6.076923076923077e-06, |
| "loss": 0.2978, |
| "step": 918 |
| }, |
| { |
| "epoch": 7.854700854700854, |
| "grad_norm": 11.751949310302734, |
| "learning_rate": 6.0726495726495726e-06, |
| "loss": 0.5509, |
| "step": 919 |
| }, |
| { |
| "epoch": 7.863247863247864, |
| "grad_norm": 6.067570209503174, |
| "learning_rate": 6.0683760683760684e-06, |
| "loss": 0.475, |
| "step": 920 |
| }, |
| { |
| "epoch": 7.871794871794872, |
| "grad_norm": 7.4297919273376465, |
| "learning_rate": 6.064102564102564e-06, |
| "loss": 0.5073, |
| "step": 921 |
| }, |
| { |
| "epoch": 7.880341880341881, |
| "grad_norm": 6.778268337249756, |
| "learning_rate": 6.05982905982906e-06, |
| "loss": 0.4718, |
| "step": 922 |
| }, |
| { |
| "epoch": 7.888888888888889, |
| "grad_norm": 9.401915550231934, |
| "learning_rate": 6.055555555555555e-06, |
| "loss": 0.7151, |
| "step": 923 |
| }, |
| { |
| "epoch": 7.897435897435898, |
| "grad_norm": 6.359888553619385, |
| "learning_rate": 6.051282051282051e-06, |
| "loss": 0.3175, |
| "step": 924 |
| }, |
| { |
| "epoch": 7.905982905982906, |
| "grad_norm": 7.036016464233398, |
| "learning_rate": 6.047008547008547e-06, |
| "loss": 0.3172, |
| "step": 925 |
| }, |
| { |
| "epoch": 7.914529914529915, |
| "grad_norm": 5.980124473571777, |
| "learning_rate": 6.042735042735043e-06, |
| "loss": 0.2949, |
| "step": 926 |
| }, |
| { |
| "epoch": 7.923076923076923, |
| "grad_norm": 5.738795280456543, |
| "learning_rate": 6.038461538461539e-06, |
| "loss": 0.2454, |
| "step": 927 |
| }, |
| { |
| "epoch": 7.931623931623932, |
| "grad_norm": 4.688748359680176, |
| "learning_rate": 6.034188034188035e-06, |
| "loss": 0.1949, |
| "step": 928 |
| }, |
| { |
| "epoch": 7.94017094017094, |
| "grad_norm": 7.2333984375, |
| "learning_rate": 6.02991452991453e-06, |
| "loss": 0.2174, |
| "step": 929 |
| }, |
| { |
| "epoch": 7.948717948717949, |
| "grad_norm": 6.005523204803467, |
| "learning_rate": 6.025641025641026e-06, |
| "loss": 0.4216, |
| "step": 930 |
| }, |
| { |
| "epoch": 7.957264957264957, |
| "grad_norm": 6.017541885375977, |
| "learning_rate": 6.0213675213675215e-06, |
| "loss": 0.4904, |
| "step": 931 |
| }, |
| { |
| "epoch": 7.965811965811966, |
| "grad_norm": 19.559003829956055, |
| "learning_rate": 6.0170940170940174e-06, |
| "loss": 0.2616, |
| "step": 932 |
| }, |
| { |
| "epoch": 7.9743589743589745, |
| "grad_norm": 5.360724449157715, |
| "learning_rate": 6.012820512820513e-06, |
| "loss": 0.3629, |
| "step": 933 |
| }, |
| { |
| "epoch": 7.982905982905983, |
| "grad_norm": 9.472721099853516, |
| "learning_rate": 6.008547008547008e-06, |
| "loss": 0.5044, |
| "step": 934 |
| }, |
| { |
| "epoch": 7.9914529914529915, |
| "grad_norm": 6.453597068786621, |
| "learning_rate": 6.004273504273504e-06, |
| "loss": 0.4742, |
| "step": 935 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 7.647386074066162, |
| "learning_rate": 6e-06, |
| "loss": 0.402, |
| "step": 936 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.1672903448343277, |
| "eval_runtime": 9.3047, |
| "eval_samples_per_second": 50.082, |
| "eval_steps_per_second": 6.341, |
| "step": 936 |
| }, |
| { |
| "epoch": 8.008547008547009, |
| "grad_norm": 5.8361663818359375, |
| "learning_rate": 5.995726495726496e-06, |
| "loss": 0.164, |
| "step": 937 |
| }, |
| { |
| "epoch": 8.017094017094017, |
| "grad_norm": 5.801360130310059, |
| "learning_rate": 5.991452991452992e-06, |
| "loss": 0.2858, |
| "step": 938 |
| }, |
| { |
| "epoch": 8.025641025641026, |
| "grad_norm": 4.43051290512085, |
| "learning_rate": 5.987179487179488e-06, |
| "loss": 0.2068, |
| "step": 939 |
| }, |
| { |
| "epoch": 8.034188034188034, |
| "grad_norm": 6.544061660766602, |
| "learning_rate": 5.982905982905983e-06, |
| "loss": 0.3499, |
| "step": 940 |
| }, |
| { |
| "epoch": 8.042735042735043, |
| "grad_norm": 5.500844955444336, |
| "learning_rate": 5.978632478632479e-06, |
| "loss": 0.3134, |
| "step": 941 |
| }, |
| { |
| "epoch": 8.051282051282051, |
| "grad_norm": 4.286651611328125, |
| "learning_rate": 5.974358974358975e-06, |
| "loss": 0.1767, |
| "step": 942 |
| }, |
| { |
| "epoch": 8.05982905982906, |
| "grad_norm": 13.860437393188477, |
| "learning_rate": 5.9700854700854705e-06, |
| "loss": 0.3913, |
| "step": 943 |
| }, |
| { |
| "epoch": 8.068376068376068, |
| "grad_norm": 5.998767852783203, |
| "learning_rate": 5.9658119658119664e-06, |
| "loss": 0.2275, |
| "step": 944 |
| }, |
| { |
| "epoch": 8.076923076923077, |
| "grad_norm": 9.01196002960205, |
| "learning_rate": 5.961538461538462e-06, |
| "loss": 0.5202, |
| "step": 945 |
| }, |
| { |
| "epoch": 8.085470085470085, |
| "grad_norm": 6.81577730178833, |
| "learning_rate": 5.957264957264957e-06, |
| "loss": 0.5923, |
| "step": 946 |
| }, |
| { |
| "epoch": 8.094017094017094, |
| "grad_norm": 7.400684833526611, |
| "learning_rate": 5.952991452991453e-06, |
| "loss": 0.2883, |
| "step": 947 |
| }, |
| { |
| "epoch": 8.102564102564102, |
| "grad_norm": 16.18587875366211, |
| "learning_rate": 5.948717948717949e-06, |
| "loss": 0.3377, |
| "step": 948 |
| }, |
| { |
| "epoch": 8.11111111111111, |
| "grad_norm": 5.017345428466797, |
| "learning_rate": 5.944444444444445e-06, |
| "loss": 0.3912, |
| "step": 949 |
| }, |
| { |
| "epoch": 8.11965811965812, |
| "grad_norm": 5.300196647644043, |
| "learning_rate": 5.940170940170941e-06, |
| "loss": 0.4056, |
| "step": 950 |
| }, |
| { |
| "epoch": 8.128205128205128, |
| "grad_norm": 6.3473405838012695, |
| "learning_rate": 5.935897435897436e-06, |
| "loss": 0.2559, |
| "step": 951 |
| }, |
| { |
| "epoch": 8.136752136752136, |
| "grad_norm": 12.37689208984375, |
| "learning_rate": 5.931623931623932e-06, |
| "loss": 0.2216, |
| "step": 952 |
| }, |
| { |
| "epoch": 8.145299145299145, |
| "grad_norm": 5.573046684265137, |
| "learning_rate": 5.927350427350428e-06, |
| "loss": 0.2047, |
| "step": 953 |
| }, |
| { |
| "epoch": 8.153846153846153, |
| "grad_norm": 5.033559322357178, |
| "learning_rate": 5.923076923076924e-06, |
| "loss": 0.3661, |
| "step": 954 |
| }, |
| { |
| "epoch": 8.162393162393162, |
| "grad_norm": 5.341614246368408, |
| "learning_rate": 5.9188034188034195e-06, |
| "loss": 0.2597, |
| "step": 955 |
| }, |
| { |
| "epoch": 8.17094017094017, |
| "grad_norm": 8.67937183380127, |
| "learning_rate": 5.914529914529915e-06, |
| "loss": 0.4098, |
| "step": 956 |
| }, |
| { |
| "epoch": 8.179487179487179, |
| "grad_norm": 3.957489252090454, |
| "learning_rate": 5.9102564102564105e-06, |
| "loss": 0.18, |
| "step": 957 |
| }, |
| { |
| "epoch": 8.188034188034187, |
| "grad_norm": 6.377108573913574, |
| "learning_rate": 5.905982905982906e-06, |
| "loss": 0.3414, |
| "step": 958 |
| }, |
| { |
| "epoch": 8.196581196581196, |
| "grad_norm": 8.621227264404297, |
| "learning_rate": 5.901709401709402e-06, |
| "loss": 1.1625, |
| "step": 959 |
| }, |
| { |
| "epoch": 8.205128205128204, |
| "grad_norm": 5.775392532348633, |
| "learning_rate": 5.897435897435898e-06, |
| "loss": 0.4283, |
| "step": 960 |
| }, |
| { |
| "epoch": 8.213675213675213, |
| "grad_norm": 4.522337913513184, |
| "learning_rate": 5.893162393162394e-06, |
| "loss": 0.3432, |
| "step": 961 |
| }, |
| { |
| "epoch": 8.222222222222221, |
| "grad_norm": 5.594667434692383, |
| "learning_rate": 5.88888888888889e-06, |
| "loss": 0.5212, |
| "step": 962 |
| }, |
| { |
| "epoch": 8.23076923076923, |
| "grad_norm": 5.478531837463379, |
| "learning_rate": 5.884615384615385e-06, |
| "loss": 0.2273, |
| "step": 963 |
| }, |
| { |
| "epoch": 8.239316239316238, |
| "grad_norm": 6.08770751953125, |
| "learning_rate": 5.880341880341881e-06, |
| "loss": 0.2673, |
| "step": 964 |
| }, |
| { |
| "epoch": 8.247863247863247, |
| "grad_norm": 7.962898254394531, |
| "learning_rate": 5.876068376068377e-06, |
| "loss": 0.2654, |
| "step": 965 |
| }, |
| { |
| "epoch": 8.256410256410255, |
| "grad_norm": 6.443154335021973, |
| "learning_rate": 5.871794871794873e-06, |
| "loss": 0.2982, |
| "step": 966 |
| }, |
| { |
| "epoch": 8.264957264957266, |
| "grad_norm": 4.689123153686523, |
| "learning_rate": 5.8675213675213685e-06, |
| "loss": 0.3459, |
| "step": 967 |
| }, |
| { |
| "epoch": 8.273504273504274, |
| "grad_norm": 5.446859359741211, |
| "learning_rate": 5.863247863247864e-06, |
| "loss": 0.2792, |
| "step": 968 |
| }, |
| { |
| "epoch": 8.282051282051283, |
| "grad_norm": 5.562478542327881, |
| "learning_rate": 5.8589743589743595e-06, |
| "loss": 0.1939, |
| "step": 969 |
| }, |
| { |
| "epoch": 8.290598290598291, |
| "grad_norm": 4.726650714874268, |
| "learning_rate": 5.854700854700855e-06, |
| "loss": 0.1368, |
| "step": 970 |
| }, |
| { |
| "epoch": 8.2991452991453, |
| "grad_norm": 17.44293785095215, |
| "learning_rate": 5.850427350427351e-06, |
| "loss": 0.3836, |
| "step": 971 |
| }, |
| { |
| "epoch": 8.307692307692308, |
| "grad_norm": 5.568243980407715, |
| "learning_rate": 5.846153846153847e-06, |
| "loss": 0.3674, |
| "step": 972 |
| }, |
| { |
| "epoch": 8.316239316239317, |
| "grad_norm": 3.488147258758545, |
| "learning_rate": 5.841880341880343e-06, |
| "loss": 0.197, |
| "step": 973 |
| }, |
| { |
| "epoch": 8.324786324786325, |
| "grad_norm": 15.902129173278809, |
| "learning_rate": 5.837606837606838e-06, |
| "loss": 0.4199, |
| "step": 974 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 8.055335998535156, |
| "learning_rate": 5.833333333333334e-06, |
| "loss": 0.277, |
| "step": 975 |
| }, |
| { |
| "epoch": 8.341880341880342, |
| "grad_norm": 8.122756004333496, |
| "learning_rate": 5.82905982905983e-06, |
| "loss": 0.5572, |
| "step": 976 |
| }, |
| { |
| "epoch": 8.350427350427351, |
| "grad_norm": 5.7439961433410645, |
| "learning_rate": 5.824786324786326e-06, |
| "loss": 0.2031, |
| "step": 977 |
| }, |
| { |
| "epoch": 8.35897435897436, |
| "grad_norm": 4.329511642456055, |
| "learning_rate": 5.820512820512822e-06, |
| "loss": 0.4405, |
| "step": 978 |
| }, |
| { |
| "epoch": 8.367521367521368, |
| "grad_norm": 10.946788787841797, |
| "learning_rate": 5.8162393162393175e-06, |
| "loss": 0.4619, |
| "step": 979 |
| }, |
| { |
| "epoch": 8.376068376068377, |
| "grad_norm": 6.0579352378845215, |
| "learning_rate": 5.8119658119658126e-06, |
| "loss": 0.4679, |
| "step": 980 |
| }, |
| { |
| "epoch": 8.384615384615385, |
| "grad_norm": 5.656944751739502, |
| "learning_rate": 5.8076923076923084e-06, |
| "loss": 0.2395, |
| "step": 981 |
| }, |
| { |
| "epoch": 8.393162393162394, |
| "grad_norm": 5.344303607940674, |
| "learning_rate": 5.803418803418804e-06, |
| "loss": 0.2516, |
| "step": 982 |
| }, |
| { |
| "epoch": 8.401709401709402, |
| "grad_norm": 7.070309638977051, |
| "learning_rate": 5.7991452991453e-06, |
| "loss": 0.3169, |
| "step": 983 |
| }, |
| { |
| "epoch": 8.41025641025641, |
| "grad_norm": 5.168705940246582, |
| "learning_rate": 5.794871794871796e-06, |
| "loss": 0.3007, |
| "step": 984 |
| }, |
| { |
| "epoch": 8.418803418803419, |
| "grad_norm": 3.556293249130249, |
| "learning_rate": 5.790598290598292e-06, |
| "loss": 0.2089, |
| "step": 985 |
| }, |
| { |
| "epoch": 8.427350427350428, |
| "grad_norm": 4.943065166473389, |
| "learning_rate": 5.786324786324787e-06, |
| "loss": 0.2093, |
| "step": 986 |
| }, |
| { |
| "epoch": 8.435897435897436, |
| "grad_norm": 6.991105556488037, |
| "learning_rate": 5.782051282051283e-06, |
| "loss": 0.4671, |
| "step": 987 |
| }, |
| { |
| "epoch": 8.444444444444445, |
| "grad_norm": 5.276190280914307, |
| "learning_rate": 5.777777777777778e-06, |
| "loss": 0.2092, |
| "step": 988 |
| }, |
| { |
| "epoch": 8.452991452991453, |
| "grad_norm": 77.91864776611328, |
| "learning_rate": 5.773504273504273e-06, |
| "loss": 1.7536, |
| "step": 989 |
| }, |
| { |
| "epoch": 8.461538461538462, |
| "grad_norm": 4.864828109741211, |
| "learning_rate": 5.769230769230769e-06, |
| "loss": 0.1669, |
| "step": 990 |
| }, |
| { |
| "epoch": 8.47008547008547, |
| "grad_norm": 4.416967391967773, |
| "learning_rate": 5.764957264957265e-06, |
| "loss": 0.2705, |
| "step": 991 |
| }, |
| { |
| "epoch": 8.478632478632479, |
| "grad_norm": 4.558652400970459, |
| "learning_rate": 5.760683760683761e-06, |
| "loss": 0.4332, |
| "step": 992 |
| }, |
| { |
| "epoch": 8.487179487179487, |
| "grad_norm": 8.17482852935791, |
| "learning_rate": 5.756410256410257e-06, |
| "loss": 0.7286, |
| "step": 993 |
| }, |
| { |
| "epoch": 8.495726495726496, |
| "grad_norm": 7.322425365447998, |
| "learning_rate": 5.7521367521367525e-06, |
| "loss": 0.8554, |
| "step": 994 |
| }, |
| { |
| "epoch": 8.504273504273504, |
| "grad_norm": 4.249075889587402, |
| "learning_rate": 5.7478632478632475e-06, |
| "loss": 0.2442, |
| "step": 995 |
| }, |
| { |
| "epoch": 8.512820512820513, |
| "grad_norm": 4.157267093658447, |
| "learning_rate": 5.743589743589743e-06, |
| "loss": 0.4207, |
| "step": 996 |
| }, |
| { |
| "epoch": 8.521367521367521, |
| "grad_norm": 4.118504047393799, |
| "learning_rate": 5.739316239316239e-06, |
| "loss": 0.1411, |
| "step": 997 |
| }, |
| { |
| "epoch": 8.52991452991453, |
| "grad_norm": 7.273322105407715, |
| "learning_rate": 5.735042735042735e-06, |
| "loss": 0.6269, |
| "step": 998 |
| }, |
| { |
| "epoch": 8.538461538461538, |
| "grad_norm": 4.7668633460998535, |
| "learning_rate": 5.730769230769231e-06, |
| "loss": 0.1894, |
| "step": 999 |
| }, |
| { |
| "epoch": 8.547008547008547, |
| "grad_norm": 5.869007110595703, |
| "learning_rate": 5.726495726495727e-06, |
| "loss": 0.7301, |
| "step": 1000 |
| }, |
| { |
| "epoch": 8.555555555555555, |
| "grad_norm": 5.987617015838623, |
| "learning_rate": 5.722222222222222e-06, |
| "loss": 0.29, |
| "step": 1001 |
| }, |
| { |
| "epoch": 8.564102564102564, |
| "grad_norm": 5.445812702178955, |
| "learning_rate": 5.717948717948718e-06, |
| "loss": 0.4278, |
| "step": 1002 |
| }, |
| { |
| "epoch": 8.572649572649572, |
| "grad_norm": 4.7509002685546875, |
| "learning_rate": 5.713675213675214e-06, |
| "loss": 0.3396, |
| "step": 1003 |
| }, |
| { |
| "epoch": 8.581196581196581, |
| "grad_norm": 5.584397315979004, |
| "learning_rate": 5.70940170940171e-06, |
| "loss": 0.1329, |
| "step": 1004 |
| }, |
| { |
| "epoch": 8.58974358974359, |
| "grad_norm": 4.627229690551758, |
| "learning_rate": 5.705128205128206e-06, |
| "loss": 0.3012, |
| "step": 1005 |
| }, |
| { |
| "epoch": 8.598290598290598, |
| "grad_norm": 7.724045276641846, |
| "learning_rate": 5.7008547008547015e-06, |
| "loss": 0.4876, |
| "step": 1006 |
| }, |
| { |
| "epoch": 8.606837606837606, |
| "grad_norm": 3.488499164581299, |
| "learning_rate": 5.6965811965811965e-06, |
| "loss": 0.2025, |
| "step": 1007 |
| }, |
| { |
| "epoch": 8.615384615384615, |
| "grad_norm": 14.487537384033203, |
| "learning_rate": 5.692307692307692e-06, |
| "loss": 0.6795, |
| "step": 1008 |
| }, |
| { |
| "epoch": 8.623931623931623, |
| "grad_norm": 4.03059196472168, |
| "learning_rate": 5.688034188034188e-06, |
| "loss": 0.2121, |
| "step": 1009 |
| }, |
| { |
| "epoch": 8.632478632478632, |
| "grad_norm": 3.278873920440674, |
| "learning_rate": 5.683760683760684e-06, |
| "loss": 0.3475, |
| "step": 1010 |
| }, |
| { |
| "epoch": 8.64102564102564, |
| "grad_norm": 4.599937915802002, |
| "learning_rate": 5.67948717948718e-06, |
| "loss": 0.2355, |
| "step": 1011 |
| }, |
| { |
| "epoch": 8.649572649572649, |
| "grad_norm": 6.314788818359375, |
| "learning_rate": 5.675213675213675e-06, |
| "loss": 0.2402, |
| "step": 1012 |
| }, |
| { |
| "epoch": 8.658119658119658, |
| "grad_norm": 3.4483532905578613, |
| "learning_rate": 5.670940170940171e-06, |
| "loss": 0.2189, |
| "step": 1013 |
| }, |
| { |
| "epoch": 8.666666666666666, |
| "grad_norm": 299.8923645019531, |
| "learning_rate": 5.666666666666667e-06, |
| "loss": 1.0473, |
| "step": 1014 |
| }, |
| { |
| "epoch": 8.675213675213675, |
| "grad_norm": 13.14855670928955, |
| "learning_rate": 5.662393162393163e-06, |
| "loss": 0.3723, |
| "step": 1015 |
| }, |
| { |
| "epoch": 8.683760683760683, |
| "grad_norm": 6.513180732727051, |
| "learning_rate": 5.658119658119659e-06, |
| "loss": 0.483, |
| "step": 1016 |
| }, |
| { |
| "epoch": 8.692307692307692, |
| "grad_norm": 5.026037693023682, |
| "learning_rate": 5.6538461538461546e-06, |
| "loss": 0.4417, |
| "step": 1017 |
| }, |
| { |
| "epoch": 8.7008547008547, |
| "grad_norm": 176.535888671875, |
| "learning_rate": 5.64957264957265e-06, |
| "loss": 0.5256, |
| "step": 1018 |
| }, |
| { |
| "epoch": 8.709401709401709, |
| "grad_norm": 6.023639678955078, |
| "learning_rate": 5.6452991452991455e-06, |
| "loss": 0.3708, |
| "step": 1019 |
| }, |
| { |
| "epoch": 8.717948717948717, |
| "grad_norm": 16.64018440246582, |
| "learning_rate": 5.641025641025641e-06, |
| "loss": 0.8908, |
| "step": 1020 |
| }, |
| { |
| "epoch": 8.726495726495726, |
| "grad_norm": 2.9167582988739014, |
| "learning_rate": 5.636752136752137e-06, |
| "loss": 0.077, |
| "step": 1021 |
| }, |
| { |
| "epoch": 8.735042735042736, |
| "grad_norm": 3.368325710296631, |
| "learning_rate": 5.632478632478633e-06, |
| "loss": 0.2495, |
| "step": 1022 |
| }, |
| { |
| "epoch": 8.743589743589745, |
| "grad_norm": 3.7961905002593994, |
| "learning_rate": 5.628205128205129e-06, |
| "loss": 0.4427, |
| "step": 1023 |
| }, |
| { |
| "epoch": 8.752136752136753, |
| "grad_norm": 4.661024570465088, |
| "learning_rate": 5.623931623931624e-06, |
| "loss": 0.3092, |
| "step": 1024 |
| }, |
| { |
| "epoch": 8.760683760683762, |
| "grad_norm": 5.1971588134765625, |
| "learning_rate": 5.61965811965812e-06, |
| "loss": 0.2213, |
| "step": 1025 |
| }, |
| { |
| "epoch": 8.76923076923077, |
| "grad_norm": 4.427041530609131, |
| "learning_rate": 5.615384615384616e-06, |
| "loss": 0.2885, |
| "step": 1026 |
| }, |
| { |
| "epoch": 8.777777777777779, |
| "grad_norm": 7.352906703948975, |
| "learning_rate": 5.611111111111112e-06, |
| "loss": 0.2689, |
| "step": 1027 |
| }, |
| { |
| "epoch": 8.786324786324787, |
| "grad_norm": 5.306934833526611, |
| "learning_rate": 5.606837606837608e-06, |
| "loss": 0.3758, |
| "step": 1028 |
| }, |
| { |
| "epoch": 8.794871794871796, |
| "grad_norm": 4.502418041229248, |
| "learning_rate": 5.602564102564103e-06, |
| "loss": 0.4655, |
| "step": 1029 |
| }, |
| { |
| "epoch": 8.803418803418804, |
| "grad_norm": 3.427734851837158, |
| "learning_rate": 5.598290598290599e-06, |
| "loss": 0.1145, |
| "step": 1030 |
| }, |
| { |
| "epoch": 8.811965811965813, |
| "grad_norm": 4.047433376312256, |
| "learning_rate": 5.5940170940170945e-06, |
| "loss": 0.1482, |
| "step": 1031 |
| }, |
| { |
| "epoch": 8.820512820512821, |
| "grad_norm": 3.6860435009002686, |
| "learning_rate": 5.58974358974359e-06, |
| "loss": 0.1152, |
| "step": 1032 |
| }, |
| { |
| "epoch": 8.82905982905983, |
| "grad_norm": 6.792733669281006, |
| "learning_rate": 5.585470085470086e-06, |
| "loss": 0.1732, |
| "step": 1033 |
| }, |
| { |
| "epoch": 8.837606837606838, |
| "grad_norm": 4.222206115722656, |
| "learning_rate": 5.581196581196582e-06, |
| "loss": 0.1259, |
| "step": 1034 |
| }, |
| { |
| "epoch": 8.846153846153847, |
| "grad_norm": 4.376220703125, |
| "learning_rate": 5.576923076923077e-06, |
| "loss": 0.2403, |
| "step": 1035 |
| }, |
| { |
| "epoch": 8.854700854700855, |
| "grad_norm": 3.459076166152954, |
| "learning_rate": 5.572649572649573e-06, |
| "loss": 0.2064, |
| "step": 1036 |
| }, |
| { |
| "epoch": 8.863247863247864, |
| "grad_norm": 6.312697410583496, |
| "learning_rate": 5.568376068376069e-06, |
| "loss": 0.5076, |
| "step": 1037 |
| }, |
| { |
| "epoch": 8.871794871794872, |
| "grad_norm": 10.137848854064941, |
| "learning_rate": 5.564102564102565e-06, |
| "loss": 0.1649, |
| "step": 1038 |
| }, |
| { |
| "epoch": 8.88034188034188, |
| "grad_norm": 6.605007171630859, |
| "learning_rate": 5.559829059829061e-06, |
| "loss": 0.4233, |
| "step": 1039 |
| }, |
| { |
| "epoch": 8.88888888888889, |
| "grad_norm": 3.9786465167999268, |
| "learning_rate": 5.555555555555557e-06, |
| "loss": 0.1801, |
| "step": 1040 |
| }, |
| { |
| "epoch": 8.897435897435898, |
| "grad_norm": 4.40491247177124, |
| "learning_rate": 5.551282051282052e-06, |
| "loss": 0.169, |
| "step": 1041 |
| }, |
| { |
| "epoch": 8.905982905982906, |
| "grad_norm": 4.719818592071533, |
| "learning_rate": 5.547008547008548e-06, |
| "loss": 0.1454, |
| "step": 1042 |
| }, |
| { |
| "epoch": 8.914529914529915, |
| "grad_norm": 2.384941577911377, |
| "learning_rate": 5.5427350427350435e-06, |
| "loss": 0.0723, |
| "step": 1043 |
| }, |
| { |
| "epoch": 8.923076923076923, |
| "grad_norm": 3.258315324783325, |
| "learning_rate": 5.538461538461539e-06, |
| "loss": 0.1023, |
| "step": 1044 |
| }, |
| { |
| "epoch": 8.931623931623932, |
| "grad_norm": 18.745052337646484, |
| "learning_rate": 5.534188034188035e-06, |
| "loss": 0.2673, |
| "step": 1045 |
| }, |
| { |
| "epoch": 8.94017094017094, |
| "grad_norm": 3.788177967071533, |
| "learning_rate": 5.52991452991453e-06, |
| "loss": 0.3173, |
| "step": 1046 |
| }, |
| { |
| "epoch": 8.948717948717949, |
| "grad_norm": 2.734895944595337, |
| "learning_rate": 5.525641025641026e-06, |
| "loss": 0.0834, |
| "step": 1047 |
| }, |
| { |
| "epoch": 8.957264957264957, |
| "grad_norm": 4.158284664154053, |
| "learning_rate": 5.521367521367522e-06, |
| "loss": 0.3414, |
| "step": 1048 |
| }, |
| { |
| "epoch": 8.965811965811966, |
| "grad_norm": 4.875148296356201, |
| "learning_rate": 5.517094017094018e-06, |
| "loss": 0.2729, |
| "step": 1049 |
| }, |
| { |
| "epoch": 8.974358974358974, |
| "grad_norm": 5.2556352615356445, |
| "learning_rate": 5.512820512820514e-06, |
| "loss": 0.1422, |
| "step": 1050 |
| }, |
| { |
| "epoch": 8.982905982905983, |
| "grad_norm": 3.817049980163574, |
| "learning_rate": 5.50854700854701e-06, |
| "loss": 0.2514, |
| "step": 1051 |
| }, |
| { |
| "epoch": 8.991452991452991, |
| "grad_norm": 2.247227668762207, |
| "learning_rate": 5.504273504273505e-06, |
| "loss": 0.0703, |
| "step": 1052 |
| }, |
| { |
| "epoch": 9.0, |
| "grad_norm": 34.36362838745117, |
| "learning_rate": 5.500000000000001e-06, |
| "loss": 0.7433, |
| "step": 1053 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.12675683200359344, |
| "eval_runtime": 9.3141, |
| "eval_samples_per_second": 50.032, |
| "eval_steps_per_second": 6.334, |
| "step": 1053 |
| }, |
| { |
| "epoch": 9.008547008547009, |
| "grad_norm": 5.314228057861328, |
| "learning_rate": 5.495726495726497e-06, |
| "loss": 0.2576, |
| "step": 1054 |
| }, |
| { |
| "epoch": 9.017094017094017, |
| "grad_norm": 34.33782958984375, |
| "learning_rate": 5.4914529914529925e-06, |
| "loss": 0.3833, |
| "step": 1055 |
| }, |
| { |
| "epoch": 9.025641025641026, |
| "grad_norm": 5.440598964691162, |
| "learning_rate": 5.487179487179488e-06, |
| "loss": 0.3898, |
| "step": 1056 |
| }, |
| { |
| "epoch": 9.034188034188034, |
| "grad_norm": 3.561518907546997, |
| "learning_rate": 5.482905982905984e-06, |
| "loss": 0.2197, |
| "step": 1057 |
| }, |
| { |
| "epoch": 9.042735042735043, |
| "grad_norm": 4.7679762840271, |
| "learning_rate": 5.478632478632479e-06, |
| "loss": 0.3885, |
| "step": 1058 |
| }, |
| { |
| "epoch": 9.051282051282051, |
| "grad_norm": 4.694134712219238, |
| "learning_rate": 5.474358974358975e-06, |
| "loss": 0.2532, |
| "step": 1059 |
| }, |
| { |
| "epoch": 9.05982905982906, |
| "grad_norm": 4.347025394439697, |
| "learning_rate": 5.470085470085471e-06, |
| "loss": 0.1949, |
| "step": 1060 |
| }, |
| { |
| "epoch": 9.068376068376068, |
| "grad_norm": 4.064525127410889, |
| "learning_rate": 5.465811965811966e-06, |
| "loss": 0.1597, |
| "step": 1061 |
| }, |
| { |
| "epoch": 9.076923076923077, |
| "grad_norm": 3.78560471534729, |
| "learning_rate": 5.461538461538461e-06, |
| "loss": 0.18, |
| "step": 1062 |
| }, |
| { |
| "epoch": 9.085470085470085, |
| "grad_norm": 7.843743324279785, |
| "learning_rate": 5.457264957264957e-06, |
| "loss": 0.3146, |
| "step": 1063 |
| }, |
| { |
| "epoch": 9.094017094017094, |
| "grad_norm": 8.152037620544434, |
| "learning_rate": 5.452991452991453e-06, |
| "loss": 0.3384, |
| "step": 1064 |
| }, |
| { |
| "epoch": 9.102564102564102, |
| "grad_norm": 3.987872838973999, |
| "learning_rate": 5.448717948717949e-06, |
| "loss": 0.2071, |
| "step": 1065 |
| }, |
| { |
| "epoch": 9.11111111111111, |
| "grad_norm": 3.478532552719116, |
| "learning_rate": 5.444444444444445e-06, |
| "loss": 0.1788, |
| "step": 1066 |
| }, |
| { |
| "epoch": 9.11965811965812, |
| "grad_norm": 3.6598286628723145, |
| "learning_rate": 5.44017094017094e-06, |
| "loss": 0.2459, |
| "step": 1067 |
| }, |
| { |
| "epoch": 9.128205128205128, |
| "grad_norm": 9.528829574584961, |
| "learning_rate": 5.435897435897436e-06, |
| "loss": 0.2046, |
| "step": 1068 |
| }, |
| { |
| "epoch": 9.136752136752136, |
| "grad_norm": 3.3274407386779785, |
| "learning_rate": 5.4316239316239316e-06, |
| "loss": 0.1414, |
| "step": 1069 |
| }, |
| { |
| "epoch": 9.145299145299145, |
| "grad_norm": 5.117324352264404, |
| "learning_rate": 5.4273504273504275e-06, |
| "loss": 0.3636, |
| "step": 1070 |
| }, |
| { |
| "epoch": 9.153846153846153, |
| "grad_norm": 8.604976654052734, |
| "learning_rate": 5.423076923076923e-06, |
| "loss": 0.2723, |
| "step": 1071 |
| }, |
| { |
| "epoch": 9.162393162393162, |
| "grad_norm": 72.67993927001953, |
| "learning_rate": 5.418803418803419e-06, |
| "loss": 0.5863, |
| "step": 1072 |
| }, |
| { |
| "epoch": 9.17094017094017, |
| "grad_norm": 3.8609094619750977, |
| "learning_rate": 5.414529914529914e-06, |
| "loss": 0.1778, |
| "step": 1073 |
| }, |
| { |
| "epoch": 9.179487179487179, |
| "grad_norm": 21.24209976196289, |
| "learning_rate": 5.41025641025641e-06, |
| "loss": 0.2062, |
| "step": 1074 |
| }, |
| { |
| "epoch": 9.188034188034187, |
| "grad_norm": 5.552285194396973, |
| "learning_rate": 5.405982905982906e-06, |
| "loss": 0.4685, |
| "step": 1075 |
| }, |
| { |
| "epoch": 9.196581196581196, |
| "grad_norm": 12.241254806518555, |
| "learning_rate": 5.401709401709402e-06, |
| "loss": 0.4309, |
| "step": 1076 |
| }, |
| { |
| "epoch": 9.205128205128204, |
| "grad_norm": 3.6276049613952637, |
| "learning_rate": 5.397435897435898e-06, |
| "loss": 0.0924, |
| "step": 1077 |
| }, |
| { |
| "epoch": 9.213675213675213, |
| "grad_norm": 10.98838996887207, |
| "learning_rate": 5.393162393162394e-06, |
| "loss": 0.7616, |
| "step": 1078 |
| }, |
| { |
| "epoch": 9.222222222222221, |
| "grad_norm": 4.689146041870117, |
| "learning_rate": 5.388888888888889e-06, |
| "loss": 0.346, |
| "step": 1079 |
| }, |
| { |
| "epoch": 9.23076923076923, |
| "grad_norm": 6.385439872741699, |
| "learning_rate": 5.384615384615385e-06, |
| "loss": 0.2945, |
| "step": 1080 |
| }, |
| { |
| "epoch": 9.239316239316238, |
| "grad_norm": 2.4931023120880127, |
| "learning_rate": 5.3803418803418806e-06, |
| "loss": 0.172, |
| "step": 1081 |
| }, |
| { |
| "epoch": 9.247863247863247, |
| "grad_norm": 3.797539472579956, |
| "learning_rate": 5.3760683760683764e-06, |
| "loss": 0.0927, |
| "step": 1082 |
| }, |
| { |
| "epoch": 9.256410256410255, |
| "grad_norm": 2.7136716842651367, |
| "learning_rate": 5.371794871794872e-06, |
| "loss": 0.0932, |
| "step": 1083 |
| }, |
| { |
| "epoch": 9.264957264957266, |
| "grad_norm": 5.207858085632324, |
| "learning_rate": 5.367521367521367e-06, |
| "loss": 0.1176, |
| "step": 1084 |
| }, |
| { |
| "epoch": 9.273504273504274, |
| "grad_norm": 3.95009183883667, |
| "learning_rate": 5.363247863247863e-06, |
| "loss": 0.3045, |
| "step": 1085 |
| }, |
| { |
| "epoch": 9.282051282051283, |
| "grad_norm": 1.9097685813903809, |
| "learning_rate": 5.358974358974359e-06, |
| "loss": 0.1793, |
| "step": 1086 |
| }, |
| { |
| "epoch": 9.290598290598291, |
| "grad_norm": 3.205216407775879, |
| "learning_rate": 5.354700854700855e-06, |
| "loss": 0.1071, |
| "step": 1087 |
| }, |
| { |
| "epoch": 9.2991452991453, |
| "grad_norm": 3.481822967529297, |
| "learning_rate": 5.350427350427351e-06, |
| "loss": 0.3885, |
| "step": 1088 |
| }, |
| { |
| "epoch": 9.307692307692308, |
| "grad_norm": 11.802562713623047, |
| "learning_rate": 5.346153846153847e-06, |
| "loss": 0.1769, |
| "step": 1089 |
| }, |
| { |
| "epoch": 9.316239316239317, |
| "grad_norm": 3.101505994796753, |
| "learning_rate": 5.341880341880342e-06, |
| "loss": 0.1265, |
| "step": 1090 |
| }, |
| { |
| "epoch": 9.324786324786325, |
| "grad_norm": 5.163032054901123, |
| "learning_rate": 5.337606837606838e-06, |
| "loss": 0.4768, |
| "step": 1091 |
| }, |
| { |
| "epoch": 9.333333333333334, |
| "grad_norm": 1.8217605352401733, |
| "learning_rate": 5.333333333333334e-06, |
| "loss": 0.053, |
| "step": 1092 |
| }, |
| { |
| "epoch": 9.341880341880342, |
| "grad_norm": 2.6139562129974365, |
| "learning_rate": 5.3290598290598295e-06, |
| "loss": 0.0848, |
| "step": 1093 |
| }, |
| { |
| "epoch": 9.350427350427351, |
| "grad_norm": 3.1172311305999756, |
| "learning_rate": 5.3247863247863254e-06, |
| "loss": 0.1076, |
| "step": 1094 |
| }, |
| { |
| "epoch": 9.35897435897436, |
| "grad_norm": 5.907342433929443, |
| "learning_rate": 5.320512820512821e-06, |
| "loss": 0.1737, |
| "step": 1095 |
| }, |
| { |
| "epoch": 9.367521367521368, |
| "grad_norm": 45.74967575073242, |
| "learning_rate": 5.316239316239316e-06, |
| "loss": 0.2455, |
| "step": 1096 |
| }, |
| { |
| "epoch": 9.376068376068377, |
| "grad_norm": 3.1865549087524414, |
| "learning_rate": 5.311965811965812e-06, |
| "loss": 0.2236, |
| "step": 1097 |
| }, |
| { |
| "epoch": 9.384615384615385, |
| "grad_norm": 4.028379917144775, |
| "learning_rate": 5.307692307692308e-06, |
| "loss": 0.1065, |
| "step": 1098 |
| }, |
| { |
| "epoch": 9.393162393162394, |
| "grad_norm": 5.388605117797852, |
| "learning_rate": 5.303418803418804e-06, |
| "loss": 0.2967, |
| "step": 1099 |
| }, |
| { |
| "epoch": 9.401709401709402, |
| "grad_norm": 3.661736249923706, |
| "learning_rate": 5.2991452991453e-06, |
| "loss": 0.1271, |
| "step": 1100 |
| }, |
| { |
| "epoch": 9.41025641025641, |
| "grad_norm": 4.693649768829346, |
| "learning_rate": 5.294871794871795e-06, |
| "loss": 0.7891, |
| "step": 1101 |
| }, |
| { |
| "epoch": 9.418803418803419, |
| "grad_norm": 14.75247573852539, |
| "learning_rate": 5.290598290598291e-06, |
| "loss": 0.707, |
| "step": 1102 |
| }, |
| { |
| "epoch": 9.427350427350428, |
| "grad_norm": 5.123616695404053, |
| "learning_rate": 5.286324786324787e-06, |
| "loss": 0.2424, |
| "step": 1103 |
| }, |
| { |
| "epoch": 9.435897435897436, |
| "grad_norm": 5.946259021759033, |
| "learning_rate": 5.282051282051283e-06, |
| "loss": 0.2558, |
| "step": 1104 |
| }, |
| { |
| "epoch": 9.444444444444445, |
| "grad_norm": 3.3757872581481934, |
| "learning_rate": 5.2777777777777785e-06, |
| "loss": 0.072, |
| "step": 1105 |
| }, |
| { |
| "epoch": 9.452991452991453, |
| "grad_norm": 4.639676094055176, |
| "learning_rate": 5.2735042735042744e-06, |
| "loss": 0.1483, |
| "step": 1106 |
| }, |
| { |
| "epoch": 9.461538461538462, |
| "grad_norm": 5.552156925201416, |
| "learning_rate": 5.2692307692307695e-06, |
| "loss": 0.341, |
| "step": 1107 |
| }, |
| { |
| "epoch": 9.47008547008547, |
| "grad_norm": 10.601661682128906, |
| "learning_rate": 5.264957264957265e-06, |
| "loss": 0.5964, |
| "step": 1108 |
| }, |
| { |
| "epoch": 9.478632478632479, |
| "grad_norm": 4.391530513763428, |
| "learning_rate": 5.260683760683761e-06, |
| "loss": 0.2346, |
| "step": 1109 |
| }, |
| { |
| "epoch": 9.487179487179487, |
| "grad_norm": 3.150240659713745, |
| "learning_rate": 5.256410256410257e-06, |
| "loss": 0.1, |
| "step": 1110 |
| }, |
| { |
| "epoch": 9.495726495726496, |
| "grad_norm": 5.60894775390625, |
| "learning_rate": 5.252136752136753e-06, |
| "loss": 0.397, |
| "step": 1111 |
| }, |
| { |
| "epoch": 9.504273504273504, |
| "grad_norm": 9.21768856048584, |
| "learning_rate": 5.247863247863249e-06, |
| "loss": 0.2292, |
| "step": 1112 |
| }, |
| { |
| "epoch": 9.512820512820513, |
| "grad_norm": 8.351348876953125, |
| "learning_rate": 5.243589743589744e-06, |
| "loss": 0.3129, |
| "step": 1113 |
| }, |
| { |
| "epoch": 9.521367521367521, |
| "grad_norm": 3.0813419818878174, |
| "learning_rate": 5.23931623931624e-06, |
| "loss": 0.2539, |
| "step": 1114 |
| }, |
| { |
| "epoch": 9.52991452991453, |
| "grad_norm": 5.553039073944092, |
| "learning_rate": 5.235042735042736e-06, |
| "loss": 0.1121, |
| "step": 1115 |
| }, |
| { |
| "epoch": 9.538461538461538, |
| "grad_norm": 3.973057746887207, |
| "learning_rate": 5.230769230769232e-06, |
| "loss": 0.4928, |
| "step": 1116 |
| }, |
| { |
| "epoch": 9.547008547008547, |
| "grad_norm": 4.753414630889893, |
| "learning_rate": 5.2264957264957275e-06, |
| "loss": 0.2247, |
| "step": 1117 |
| }, |
| { |
| "epoch": 9.555555555555555, |
| "grad_norm": 7.344094753265381, |
| "learning_rate": 5.2222222222222226e-06, |
| "loss": 0.1405, |
| "step": 1118 |
| }, |
| { |
| "epoch": 9.564102564102564, |
| "grad_norm": 47.83219528198242, |
| "learning_rate": 5.2179487179487185e-06, |
| "loss": 0.3108, |
| "step": 1119 |
| }, |
| { |
| "epoch": 9.572649572649572, |
| "grad_norm": 2.31591796875, |
| "learning_rate": 5.213675213675214e-06, |
| "loss": 0.1019, |
| "step": 1120 |
| }, |
| { |
| "epoch": 9.581196581196581, |
| "grad_norm": 3.871413230895996, |
| "learning_rate": 5.20940170940171e-06, |
| "loss": 0.2562, |
| "step": 1121 |
| }, |
| { |
| "epoch": 9.58974358974359, |
| "grad_norm": 2.1789255142211914, |
| "learning_rate": 5.205128205128206e-06, |
| "loss": 0.0571, |
| "step": 1122 |
| }, |
| { |
| "epoch": 9.598290598290598, |
| "grad_norm": 4.119174957275391, |
| "learning_rate": 5.200854700854702e-06, |
| "loss": 0.2799, |
| "step": 1123 |
| }, |
| { |
| "epoch": 9.606837606837606, |
| "grad_norm": 7.873704433441162, |
| "learning_rate": 5.196581196581197e-06, |
| "loss": 0.2154, |
| "step": 1124 |
| }, |
| { |
| "epoch": 9.615384615384615, |
| "grad_norm": 3.386780023574829, |
| "learning_rate": 5.192307692307693e-06, |
| "loss": 0.1607, |
| "step": 1125 |
| }, |
| { |
| "epoch": 9.623931623931623, |
| "grad_norm": 3.3607964515686035, |
| "learning_rate": 5.188034188034189e-06, |
| "loss": 0.22, |
| "step": 1126 |
| }, |
| { |
| "epoch": 9.632478632478632, |
| "grad_norm": 10.655082702636719, |
| "learning_rate": 5.183760683760685e-06, |
| "loss": 0.2102, |
| "step": 1127 |
| }, |
| { |
| "epoch": 9.64102564102564, |
| "grad_norm": 5.550488471984863, |
| "learning_rate": 5.179487179487181e-06, |
| "loss": 0.347, |
| "step": 1128 |
| }, |
| { |
| "epoch": 9.649572649572649, |
| "grad_norm": 4.184569835662842, |
| "learning_rate": 5.1752136752136765e-06, |
| "loss": 0.183, |
| "step": 1129 |
| }, |
| { |
| "epoch": 9.658119658119658, |
| "grad_norm": 4.892969131469727, |
| "learning_rate": 5.1709401709401716e-06, |
| "loss": 0.2896, |
| "step": 1130 |
| }, |
| { |
| "epoch": 9.666666666666666, |
| "grad_norm": 5.926670074462891, |
| "learning_rate": 5.1666666666666675e-06, |
| "loss": 0.3321, |
| "step": 1131 |
| }, |
| { |
| "epoch": 9.675213675213675, |
| "grad_norm": 11.719461441040039, |
| "learning_rate": 5.162393162393163e-06, |
| "loss": 0.4055, |
| "step": 1132 |
| }, |
| { |
| "epoch": 9.683760683760683, |
| "grad_norm": 3.5666840076446533, |
| "learning_rate": 5.158119658119659e-06, |
| "loss": 0.2318, |
| "step": 1133 |
| }, |
| { |
| "epoch": 9.692307692307692, |
| "grad_norm": 6.800848484039307, |
| "learning_rate": 5.1538461538461534e-06, |
| "loss": 0.1202, |
| "step": 1134 |
| }, |
| { |
| "epoch": 9.7008547008547, |
| "grad_norm": 4.50139856338501, |
| "learning_rate": 5.149572649572649e-06, |
| "loss": 0.1914, |
| "step": 1135 |
| }, |
| { |
| "epoch": 9.709401709401709, |
| "grad_norm": 2.599607467651367, |
| "learning_rate": 5.145299145299145e-06, |
| "loss": 0.0833, |
| "step": 1136 |
| }, |
| { |
| "epoch": 9.717948717948717, |
| "grad_norm": 6.084483623504639, |
| "learning_rate": 5.141025641025641e-06, |
| "loss": 0.0907, |
| "step": 1137 |
| }, |
| { |
| "epoch": 9.726495726495726, |
| "grad_norm": 4.542915344238281, |
| "learning_rate": 5.136752136752137e-06, |
| "loss": 0.4554, |
| "step": 1138 |
| }, |
| { |
| "epoch": 9.735042735042736, |
| "grad_norm": 3.871166229248047, |
| "learning_rate": 5.132478632478632e-06, |
| "loss": 0.3037, |
| "step": 1139 |
| }, |
| { |
| "epoch": 9.743589743589745, |
| "grad_norm": 5.121057033538818, |
| "learning_rate": 5.128205128205128e-06, |
| "loss": 0.1751, |
| "step": 1140 |
| }, |
| { |
| "epoch": 9.752136752136753, |
| "grad_norm": 3.7517125606536865, |
| "learning_rate": 5.123931623931624e-06, |
| "loss": 0.3144, |
| "step": 1141 |
| }, |
| { |
| "epoch": 9.760683760683762, |
| "grad_norm": 1.7604278326034546, |
| "learning_rate": 5.11965811965812e-06, |
| "loss": 0.0649, |
| "step": 1142 |
| }, |
| { |
| "epoch": 9.76923076923077, |
| "grad_norm": 13.68947982788086, |
| "learning_rate": 5.115384615384616e-06, |
| "loss": 0.2184, |
| "step": 1143 |
| }, |
| { |
| "epoch": 9.777777777777779, |
| "grad_norm": 5.716836452484131, |
| "learning_rate": 5.1111111111111115e-06, |
| "loss": 0.1876, |
| "step": 1144 |
| }, |
| { |
| "epoch": 9.786324786324787, |
| "grad_norm": 8.21943187713623, |
| "learning_rate": 5.1068376068376065e-06, |
| "loss": 0.349, |
| "step": 1145 |
| }, |
| { |
| "epoch": 9.794871794871796, |
| "grad_norm": 5.270402908325195, |
| "learning_rate": 5.1025641025641024e-06, |
| "loss": 0.4442, |
| "step": 1146 |
| }, |
| { |
| "epoch": 9.803418803418804, |
| "grad_norm": 2.3825948238372803, |
| "learning_rate": 5.098290598290598e-06, |
| "loss": 0.2237, |
| "step": 1147 |
| }, |
| { |
| "epoch": 9.811965811965813, |
| "grad_norm": 11.812047958374023, |
| "learning_rate": 5.094017094017094e-06, |
| "loss": 0.5122, |
| "step": 1148 |
| }, |
| { |
| "epoch": 9.820512820512821, |
| "grad_norm": 9.14202880859375, |
| "learning_rate": 5.08974358974359e-06, |
| "loss": 0.3407, |
| "step": 1149 |
| }, |
| { |
| "epoch": 9.82905982905983, |
| "grad_norm": 5.273305892944336, |
| "learning_rate": 5.085470085470086e-06, |
| "loss": 0.1702, |
| "step": 1150 |
| }, |
| { |
| "epoch": 9.837606837606838, |
| "grad_norm": 2.995126485824585, |
| "learning_rate": 5.081196581196581e-06, |
| "loss": 0.228, |
| "step": 1151 |
| }, |
| { |
| "epoch": 9.846153846153847, |
| "grad_norm": 4.077675819396973, |
| "learning_rate": 5.076923076923077e-06, |
| "loss": 0.4022, |
| "step": 1152 |
| }, |
| { |
| "epoch": 9.854700854700855, |
| "grad_norm": 2.1732425689697266, |
| "learning_rate": 5.072649572649573e-06, |
| "loss": 0.1178, |
| "step": 1153 |
| }, |
| { |
| "epoch": 9.863247863247864, |
| "grad_norm": 2.905172109603882, |
| "learning_rate": 5.068376068376069e-06, |
| "loss": 0.1718, |
| "step": 1154 |
| }, |
| { |
| "epoch": 9.871794871794872, |
| "grad_norm": 2.702521324157715, |
| "learning_rate": 5.064102564102565e-06, |
| "loss": 0.1488, |
| "step": 1155 |
| }, |
| { |
| "epoch": 9.88034188034188, |
| "grad_norm": 2.414088487625122, |
| "learning_rate": 5.05982905982906e-06, |
| "loss": 0.1034, |
| "step": 1156 |
| }, |
| { |
| "epoch": 9.88888888888889, |
| "grad_norm": 2.618173360824585, |
| "learning_rate": 5.0555555555555555e-06, |
| "loss": 0.0783, |
| "step": 1157 |
| }, |
| { |
| "epoch": 9.897435897435898, |
| "grad_norm": 5.002628803253174, |
| "learning_rate": 5.051282051282051e-06, |
| "loss": 0.1195, |
| "step": 1158 |
| }, |
| { |
| "epoch": 9.905982905982906, |
| "grad_norm": 2.84708833694458, |
| "learning_rate": 5.047008547008547e-06, |
| "loss": 0.0906, |
| "step": 1159 |
| }, |
| { |
| "epoch": 9.914529914529915, |
| "grad_norm": 5.564020156860352, |
| "learning_rate": 5.042735042735043e-06, |
| "loss": 0.2037, |
| "step": 1160 |
| }, |
| { |
| "epoch": 9.923076923076923, |
| "grad_norm": 3.7763166427612305, |
| "learning_rate": 5.038461538461539e-06, |
| "loss": 0.2067, |
| "step": 1161 |
| }, |
| { |
| "epoch": 9.931623931623932, |
| "grad_norm": 2.67268705368042, |
| "learning_rate": 5.034188034188034e-06, |
| "loss": 0.0557, |
| "step": 1162 |
| }, |
| { |
| "epoch": 9.94017094017094, |
| "grad_norm": 2.4144680500030518, |
| "learning_rate": 5.02991452991453e-06, |
| "loss": 0.194, |
| "step": 1163 |
| }, |
| { |
| "epoch": 9.948717948717949, |
| "grad_norm": 2.0716731548309326, |
| "learning_rate": 5.025641025641026e-06, |
| "loss": 0.1253, |
| "step": 1164 |
| }, |
| { |
| "epoch": 9.957264957264957, |
| "grad_norm": 13.20478630065918, |
| "learning_rate": 5.021367521367522e-06, |
| "loss": 0.268, |
| "step": 1165 |
| }, |
| { |
| "epoch": 9.965811965811966, |
| "grad_norm": 2.093698263168335, |
| "learning_rate": 5.017094017094018e-06, |
| "loss": 0.0738, |
| "step": 1166 |
| }, |
| { |
| "epoch": 9.974358974358974, |
| "grad_norm": 2.2758119106292725, |
| "learning_rate": 5.012820512820514e-06, |
| "loss": 0.0804, |
| "step": 1167 |
| }, |
| { |
| "epoch": 9.982905982905983, |
| "grad_norm": 21.843395233154297, |
| "learning_rate": 5.008547008547009e-06, |
| "loss": 0.3298, |
| "step": 1168 |
| }, |
| { |
| "epoch": 9.991452991452991, |
| "grad_norm": 3.0435073375701904, |
| "learning_rate": 5.0042735042735045e-06, |
| "loss": 0.1318, |
| "step": 1169 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 8.449163436889648, |
| "learning_rate": 5e-06, |
| "loss": 0.1725, |
| "step": 1170 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.10285739600658417, |
| "eval_runtime": 9.2384, |
| "eval_samples_per_second": 50.441, |
| "eval_steps_per_second": 6.386, |
| "step": 1170 |
| }, |
| { |
| "epoch": 10.008547008547009, |
| "grad_norm": 4.151456356048584, |
| "learning_rate": 4.995726495726496e-06, |
| "loss": 0.3336, |
| "step": 1171 |
| }, |
| { |
| "epoch": 10.017094017094017, |
| "grad_norm": 2.38647723197937, |
| "learning_rate": 4.991452991452992e-06, |
| "loss": 0.1138, |
| "step": 1172 |
| }, |
| { |
| "epoch": 10.025641025641026, |
| "grad_norm": 4.44817590713501, |
| "learning_rate": 4.987179487179487e-06, |
| "loss": 0.0954, |
| "step": 1173 |
| }, |
| { |
| "epoch": 10.034188034188034, |
| "grad_norm": 2.6213347911834717, |
| "learning_rate": 4.982905982905983e-06, |
| "loss": 0.0695, |
| "step": 1174 |
| }, |
| { |
| "epoch": 10.042735042735043, |
| "grad_norm": 4.664891719818115, |
| "learning_rate": 4.978632478632479e-06, |
| "loss": 0.1067, |
| "step": 1175 |
| }, |
| { |
| "epoch": 10.051282051282051, |
| "grad_norm": 1.7059048414230347, |
| "learning_rate": 4.974358974358975e-06, |
| "loss": 0.0321, |
| "step": 1176 |
| }, |
| { |
| "epoch": 10.05982905982906, |
| "grad_norm": 5.123709678649902, |
| "learning_rate": 4.970085470085471e-06, |
| "loss": 0.2117, |
| "step": 1177 |
| }, |
| { |
| "epoch": 10.068376068376068, |
| "grad_norm": 2.2717695236206055, |
| "learning_rate": 4.965811965811967e-06, |
| "loss": 0.2187, |
| "step": 1178 |
| }, |
| { |
| "epoch": 10.076923076923077, |
| "grad_norm": 4.669886112213135, |
| "learning_rate": 4.961538461538462e-06, |
| "loss": 0.4615, |
| "step": 1179 |
| }, |
| { |
| "epoch": 10.085470085470085, |
| "grad_norm": 18.739727020263672, |
| "learning_rate": 4.957264957264958e-06, |
| "loss": 0.3431, |
| "step": 1180 |
| }, |
| { |
| "epoch": 10.094017094017094, |
| "grad_norm": 7.798559188842773, |
| "learning_rate": 4.9529914529914535e-06, |
| "loss": 0.2483, |
| "step": 1181 |
| }, |
| { |
| "epoch": 10.102564102564102, |
| "grad_norm": 22.59453773498535, |
| "learning_rate": 4.948717948717949e-06, |
| "loss": 0.15, |
| "step": 1182 |
| }, |
| { |
| "epoch": 10.11111111111111, |
| "grad_norm": 2.5734364986419678, |
| "learning_rate": 4.944444444444445e-06, |
| "loss": 0.0465, |
| "step": 1183 |
| }, |
| { |
| "epoch": 10.11965811965812, |
| "grad_norm": 3.1944875717163086, |
| "learning_rate": 4.940170940170941e-06, |
| "loss": 0.1429, |
| "step": 1184 |
| }, |
| { |
| "epoch": 10.128205128205128, |
| "grad_norm": 1.6943906545639038, |
| "learning_rate": 4.935897435897436e-06, |
| "loss": 0.0685, |
| "step": 1185 |
| }, |
| { |
| "epoch": 10.136752136752136, |
| "grad_norm": 4.497282981872559, |
| "learning_rate": 4.931623931623932e-06, |
| "loss": 0.2113, |
| "step": 1186 |
| }, |
| { |
| "epoch": 10.145299145299145, |
| "grad_norm": 2.9377167224884033, |
| "learning_rate": 4.927350427350428e-06, |
| "loss": 0.1352, |
| "step": 1187 |
| }, |
| { |
| "epoch": 10.153846153846153, |
| "grad_norm": 8.528215408325195, |
| "learning_rate": 4.923076923076924e-06, |
| "loss": 0.3268, |
| "step": 1188 |
| }, |
| { |
| "epoch": 10.162393162393162, |
| "grad_norm": 2.143850803375244, |
| "learning_rate": 4.918803418803419e-06, |
| "loss": 0.0923, |
| "step": 1189 |
| }, |
| { |
| "epoch": 10.17094017094017, |
| "grad_norm": 3.921250343322754, |
| "learning_rate": 4.914529914529915e-06, |
| "loss": 0.1451, |
| "step": 1190 |
| }, |
| { |
| "epoch": 10.179487179487179, |
| "grad_norm": 10.713285446166992, |
| "learning_rate": 4.910256410256411e-06, |
| "loss": 0.17, |
| "step": 1191 |
| }, |
| { |
| "epoch": 10.188034188034187, |
| "grad_norm": 2.450204849243164, |
| "learning_rate": 4.905982905982906e-06, |
| "loss": 0.0765, |
| "step": 1192 |
| }, |
| { |
| "epoch": 10.196581196581196, |
| "grad_norm": 4.750647068023682, |
| "learning_rate": 4.901709401709402e-06, |
| "loss": 0.2829, |
| "step": 1193 |
| }, |
| { |
| "epoch": 10.205128205128204, |
| "grad_norm": 12.714463233947754, |
| "learning_rate": 4.8974358974358975e-06, |
| "loss": 0.6767, |
| "step": 1194 |
| }, |
| { |
| "epoch": 10.213675213675213, |
| "grad_norm": 6.759951591491699, |
| "learning_rate": 4.8931623931623934e-06, |
| "loss": 0.2369, |
| "step": 1195 |
| }, |
| { |
| "epoch": 10.222222222222221, |
| "grad_norm": 8.592784881591797, |
| "learning_rate": 4.888888888888889e-06, |
| "loss": 0.4203, |
| "step": 1196 |
| }, |
| { |
| "epoch": 10.23076923076923, |
| "grad_norm": 5.04047155380249, |
| "learning_rate": 4.884615384615385e-06, |
| "loss": 0.1023, |
| "step": 1197 |
| }, |
| { |
| "epoch": 10.239316239316238, |
| "grad_norm": 38.112152099609375, |
| "learning_rate": 4.88034188034188e-06, |
| "loss": 0.4686, |
| "step": 1198 |
| }, |
| { |
| "epoch": 10.247863247863247, |
| "grad_norm": 6.751104354858398, |
| "learning_rate": 4.876068376068376e-06, |
| "loss": 0.085, |
| "step": 1199 |
| }, |
| { |
| "epoch": 10.256410256410255, |
| "grad_norm": 4.3117594718933105, |
| "learning_rate": 4.871794871794872e-06, |
| "loss": 0.1504, |
| "step": 1200 |
| }, |
| { |
| "epoch": 10.264957264957266, |
| "grad_norm": 2.251265287399292, |
| "learning_rate": 4.867521367521368e-06, |
| "loss": 0.1664, |
| "step": 1201 |
| }, |
| { |
| "epoch": 10.273504273504274, |
| "grad_norm": 2.1650373935699463, |
| "learning_rate": 4.863247863247864e-06, |
| "loss": 0.0959, |
| "step": 1202 |
| }, |
| { |
| "epoch": 10.282051282051283, |
| "grad_norm": 2.5863089561462402, |
| "learning_rate": 4.85897435897436e-06, |
| "loss": 0.1148, |
| "step": 1203 |
| }, |
| { |
| "epoch": 10.290598290598291, |
| "grad_norm": 1.974357008934021, |
| "learning_rate": 4.854700854700855e-06, |
| "loss": 0.0663, |
| "step": 1204 |
| }, |
| { |
| "epoch": 10.2991452991453, |
| "grad_norm": 2.3226940631866455, |
| "learning_rate": 4.850427350427351e-06, |
| "loss": 0.1363, |
| "step": 1205 |
| }, |
| { |
| "epoch": 10.307692307692308, |
| "grad_norm": 4.034085750579834, |
| "learning_rate": 4.8461538461538465e-06, |
| "loss": 0.3473, |
| "step": 1206 |
| }, |
| { |
| "epoch": 10.316239316239317, |
| "grad_norm": 2.492307186126709, |
| "learning_rate": 4.8418803418803424e-06, |
| "loss": 0.1742, |
| "step": 1207 |
| }, |
| { |
| "epoch": 10.324786324786325, |
| "grad_norm": 2.886432409286499, |
| "learning_rate": 4.837606837606838e-06, |
| "loss": 0.1382, |
| "step": 1208 |
| }, |
| { |
| "epoch": 10.333333333333334, |
| "grad_norm": 3.6314749717712402, |
| "learning_rate": 4.833333333333333e-06, |
| "loss": 0.1556, |
| "step": 1209 |
| }, |
| { |
| "epoch": 10.341880341880342, |
| "grad_norm": 2.2757928371429443, |
| "learning_rate": 4.829059829059829e-06, |
| "loss": 0.0434, |
| "step": 1210 |
| }, |
| { |
| "epoch": 10.350427350427351, |
| "grad_norm": 3.4152615070343018, |
| "learning_rate": 4.824786324786325e-06, |
| "loss": 0.2903, |
| "step": 1211 |
| }, |
| { |
| "epoch": 10.35897435897436, |
| "grad_norm": 3.873960256576538, |
| "learning_rate": 4.820512820512821e-06, |
| "loss": 0.2611, |
| "step": 1212 |
| }, |
| { |
| "epoch": 10.367521367521368, |
| "grad_norm": 4.2241291999816895, |
| "learning_rate": 4.816239316239317e-06, |
| "loss": 0.0954, |
| "step": 1213 |
| }, |
| { |
| "epoch": 10.376068376068377, |
| "grad_norm": 5.454725742340088, |
| "learning_rate": 4.811965811965813e-06, |
| "loss": 0.1361, |
| "step": 1214 |
| }, |
| { |
| "epoch": 10.384615384615385, |
| "grad_norm": 3.482558012008667, |
| "learning_rate": 4.807692307692308e-06, |
| "loss": 0.0861, |
| "step": 1215 |
| }, |
| { |
| "epoch": 10.393162393162394, |
| "grad_norm": 2.301254987716675, |
| "learning_rate": 4.803418803418804e-06, |
| "loss": 0.1571, |
| "step": 1216 |
| }, |
| { |
| "epoch": 10.401709401709402, |
| "grad_norm": 6.0665602684021, |
| "learning_rate": 4.7991452991453e-06, |
| "loss": 0.5323, |
| "step": 1217 |
| }, |
| { |
| "epoch": 10.41025641025641, |
| "grad_norm": 3.6052770614624023, |
| "learning_rate": 4.7948717948717955e-06, |
| "loss": 0.3789, |
| "step": 1218 |
| }, |
| { |
| "epoch": 10.418803418803419, |
| "grad_norm": 3.9434757232666016, |
| "learning_rate": 4.790598290598291e-06, |
| "loss": 0.0605, |
| "step": 1219 |
| }, |
| { |
| "epoch": 10.427350427350428, |
| "grad_norm": 5.260069847106934, |
| "learning_rate": 4.786324786324787e-06, |
| "loss": 0.3163, |
| "step": 1220 |
| }, |
| { |
| "epoch": 10.435897435897436, |
| "grad_norm": 5.219394207000732, |
| "learning_rate": 4.782051282051282e-06, |
| "loss": 0.4339, |
| "step": 1221 |
| }, |
| { |
| "epoch": 10.444444444444445, |
| "grad_norm": 2.7057230472564697, |
| "learning_rate": 4.777777777777778e-06, |
| "loss": 0.0787, |
| "step": 1222 |
| }, |
| { |
| "epoch": 10.452991452991453, |
| "grad_norm": 11.005247116088867, |
| "learning_rate": 4.773504273504274e-06, |
| "loss": 0.255, |
| "step": 1223 |
| }, |
| { |
| "epoch": 10.461538461538462, |
| "grad_norm": 1.7238801717758179, |
| "learning_rate": 4.76923076923077e-06, |
| "loss": 0.0605, |
| "step": 1224 |
| }, |
| { |
| "epoch": 10.47008547008547, |
| "grad_norm": 6.509312629699707, |
| "learning_rate": 4.764957264957265e-06, |
| "loss": 0.2899, |
| "step": 1225 |
| }, |
| { |
| "epoch": 10.478632478632479, |
| "grad_norm": 7.1476359367370605, |
| "learning_rate": 4.760683760683761e-06, |
| "loss": 0.336, |
| "step": 1226 |
| }, |
| { |
| "epoch": 10.487179487179487, |
| "grad_norm": 15.92902660369873, |
| "learning_rate": 4.756410256410257e-06, |
| "loss": 0.4864, |
| "step": 1227 |
| }, |
| { |
| "epoch": 10.495726495726496, |
| "grad_norm": 5.545684337615967, |
| "learning_rate": 4.752136752136752e-06, |
| "loss": 0.4741, |
| "step": 1228 |
| }, |
| { |
| "epoch": 10.504273504273504, |
| "grad_norm": 3.2521066665649414, |
| "learning_rate": 4.747863247863248e-06, |
| "loss": 0.0894, |
| "step": 1229 |
| }, |
| { |
| "epoch": 10.512820512820513, |
| "grad_norm": 2.696866512298584, |
| "learning_rate": 4.743589743589744e-06, |
| "loss": 0.111, |
| "step": 1230 |
| }, |
| { |
| "epoch": 10.521367521367521, |
| "grad_norm": 1.8362340927124023, |
| "learning_rate": 4.7393162393162396e-06, |
| "loss": 0.0579, |
| "step": 1231 |
| }, |
| { |
| "epoch": 10.52991452991453, |
| "grad_norm": 2.96872878074646, |
| "learning_rate": 4.7350427350427355e-06, |
| "loss": 0.0781, |
| "step": 1232 |
| }, |
| { |
| "epoch": 10.538461538461538, |
| "grad_norm": 1.5503445863723755, |
| "learning_rate": 4.730769230769231e-06, |
| "loss": 0.0451, |
| "step": 1233 |
| }, |
| { |
| "epoch": 10.547008547008547, |
| "grad_norm": 3.9600377082824707, |
| "learning_rate": 4.726495726495726e-06, |
| "loss": 0.1721, |
| "step": 1234 |
| }, |
| { |
| "epoch": 10.555555555555555, |
| "grad_norm": 3.3868823051452637, |
| "learning_rate": 4.722222222222222e-06, |
| "loss": 0.1803, |
| "step": 1235 |
| }, |
| { |
| "epoch": 10.564102564102564, |
| "grad_norm": 2.528111219406128, |
| "learning_rate": 4.717948717948718e-06, |
| "loss": 0.238, |
| "step": 1236 |
| }, |
| { |
| "epoch": 10.572649572649572, |
| "grad_norm": 6.960350036621094, |
| "learning_rate": 4.713675213675214e-06, |
| "loss": 0.4353, |
| "step": 1237 |
| }, |
| { |
| "epoch": 10.581196581196581, |
| "grad_norm": 2.3169686794281006, |
| "learning_rate": 4.70940170940171e-06, |
| "loss": 0.1891, |
| "step": 1238 |
| }, |
| { |
| "epoch": 10.58974358974359, |
| "grad_norm": 2.021212577819824, |
| "learning_rate": 4.705128205128206e-06, |
| "loss": 0.0865, |
| "step": 1239 |
| }, |
| { |
| "epoch": 10.598290598290598, |
| "grad_norm": 2.445462942123413, |
| "learning_rate": 4.700854700854701e-06, |
| "loss": 0.0973, |
| "step": 1240 |
| }, |
| { |
| "epoch": 10.606837606837606, |
| "grad_norm": 3.4490067958831787, |
| "learning_rate": 4.696581196581197e-06, |
| "loss": 0.1419, |
| "step": 1241 |
| }, |
| { |
| "epoch": 10.615384615384615, |
| "grad_norm": 3.2859914302825928, |
| "learning_rate": 4.692307692307693e-06, |
| "loss": 0.1587, |
| "step": 1242 |
| }, |
| { |
| "epoch": 10.623931623931623, |
| "grad_norm": 4.754831790924072, |
| "learning_rate": 4.6880341880341886e-06, |
| "loss": 0.2537, |
| "step": 1243 |
| }, |
| { |
| "epoch": 10.632478632478632, |
| "grad_norm": 3.220867156982422, |
| "learning_rate": 4.6837606837606844e-06, |
| "loss": 0.0941, |
| "step": 1244 |
| }, |
| { |
| "epoch": 10.64102564102564, |
| "grad_norm": 5.699328422546387, |
| "learning_rate": 4.6794871794871795e-06, |
| "loss": 0.255, |
| "step": 1245 |
| }, |
| { |
| "epoch": 10.649572649572649, |
| "grad_norm": 1.5174522399902344, |
| "learning_rate": 4.675213675213675e-06, |
| "loss": 0.048, |
| "step": 1246 |
| }, |
| { |
| "epoch": 10.658119658119658, |
| "grad_norm": 2.4277050495147705, |
| "learning_rate": 4.670940170940171e-06, |
| "loss": 0.1127, |
| "step": 1247 |
| }, |
| { |
| "epoch": 10.666666666666666, |
| "grad_norm": 2.079031229019165, |
| "learning_rate": 4.666666666666667e-06, |
| "loss": 0.1038, |
| "step": 1248 |
| }, |
| { |
| "epoch": 10.675213675213675, |
| "grad_norm": 953.4605102539062, |
| "learning_rate": 4.662393162393163e-06, |
| "loss": 1.1892, |
| "step": 1249 |
| }, |
| { |
| "epoch": 10.683760683760683, |
| "grad_norm": 9.190105438232422, |
| "learning_rate": 4.658119658119659e-06, |
| "loss": 0.3541, |
| "step": 1250 |
| }, |
| { |
| "epoch": 10.692307692307692, |
| "grad_norm": 2.3222947120666504, |
| "learning_rate": 4.653846153846154e-06, |
| "loss": 0.0842, |
| "step": 1251 |
| }, |
| { |
| "epoch": 10.7008547008547, |
| "grad_norm": 2.2312700748443604, |
| "learning_rate": 4.64957264957265e-06, |
| "loss": 0.088, |
| "step": 1252 |
| }, |
| { |
| "epoch": 10.709401709401709, |
| "grad_norm": 3.987630844116211, |
| "learning_rate": 4.645299145299146e-06, |
| "loss": 0.1667, |
| "step": 1253 |
| }, |
| { |
| "epoch": 10.717948717948717, |
| "grad_norm": 5.108981609344482, |
| "learning_rate": 4.641025641025642e-06, |
| "loss": 0.4291, |
| "step": 1254 |
| }, |
| { |
| "epoch": 10.726495726495726, |
| "grad_norm": 2.8597464561462402, |
| "learning_rate": 4.6367521367521375e-06, |
| "loss": 0.0564, |
| "step": 1255 |
| }, |
| { |
| "epoch": 10.735042735042736, |
| "grad_norm": 2.3642940521240234, |
| "learning_rate": 4.6324786324786334e-06, |
| "loss": 0.0909, |
| "step": 1256 |
| }, |
| { |
| "epoch": 10.743589743589745, |
| "grad_norm": 1.5703462362289429, |
| "learning_rate": 4.6282051282051285e-06, |
| "loss": 0.0395, |
| "step": 1257 |
| }, |
| { |
| "epoch": 10.752136752136753, |
| "grad_norm": 2.952786922454834, |
| "learning_rate": 4.623931623931624e-06, |
| "loss": 0.1824, |
| "step": 1258 |
| }, |
| { |
| "epoch": 10.760683760683762, |
| "grad_norm": 2.9027185440063477, |
| "learning_rate": 4.61965811965812e-06, |
| "loss": 0.0765, |
| "step": 1259 |
| }, |
| { |
| "epoch": 10.76923076923077, |
| "grad_norm": 2.4386038780212402, |
| "learning_rate": 4.615384615384616e-06, |
| "loss": 0.2761, |
| "step": 1260 |
| }, |
| { |
| "epoch": 10.777777777777779, |
| "grad_norm": 7.146468639373779, |
| "learning_rate": 4.611111111111112e-06, |
| "loss": 0.4427, |
| "step": 1261 |
| }, |
| { |
| "epoch": 10.786324786324787, |
| "grad_norm": 2.002096652984619, |
| "learning_rate": 4.606837606837607e-06, |
| "loss": 0.0879, |
| "step": 1262 |
| }, |
| { |
| "epoch": 10.794871794871796, |
| "grad_norm": 6.504697322845459, |
| "learning_rate": 4.602564102564103e-06, |
| "loss": 0.1805, |
| "step": 1263 |
| }, |
| { |
| "epoch": 10.803418803418804, |
| "grad_norm": 9.748340606689453, |
| "learning_rate": 4.598290598290598e-06, |
| "loss": 0.5813, |
| "step": 1264 |
| }, |
| { |
| "epoch": 10.811965811965813, |
| "grad_norm": 3.67153000831604, |
| "learning_rate": 4.594017094017094e-06, |
| "loss": 0.4175, |
| "step": 1265 |
| }, |
| { |
| "epoch": 10.820512820512821, |
| "grad_norm": 9.109044075012207, |
| "learning_rate": 4.58974358974359e-06, |
| "loss": 0.4505, |
| "step": 1266 |
| }, |
| { |
| "epoch": 10.82905982905983, |
| "grad_norm": 5.419683933258057, |
| "learning_rate": 4.585470085470086e-06, |
| "loss": 0.2316, |
| "step": 1267 |
| }, |
| { |
| "epoch": 10.837606837606838, |
| "grad_norm": 2.901182174682617, |
| "learning_rate": 4.581196581196582e-06, |
| "loss": 0.0583, |
| "step": 1268 |
| }, |
| { |
| "epoch": 10.846153846153847, |
| "grad_norm": 4.579897403717041, |
| "learning_rate": 4.5769230769230775e-06, |
| "loss": 0.0536, |
| "step": 1269 |
| }, |
| { |
| "epoch": 10.854700854700855, |
| "grad_norm": 4.232446670532227, |
| "learning_rate": 4.5726495726495725e-06, |
| "loss": 0.17, |
| "step": 1270 |
| }, |
| { |
| "epoch": 10.863247863247864, |
| "grad_norm": 8.059329986572266, |
| "learning_rate": 4.568376068376068e-06, |
| "loss": 0.256, |
| "step": 1271 |
| }, |
| { |
| "epoch": 10.871794871794872, |
| "grad_norm": 1.5736984014511108, |
| "learning_rate": 4.564102564102564e-06, |
| "loss": 0.058, |
| "step": 1272 |
| }, |
| { |
| "epoch": 10.88034188034188, |
| "grad_norm": 5.397885799407959, |
| "learning_rate": 4.55982905982906e-06, |
| "loss": 0.1299, |
| "step": 1273 |
| }, |
| { |
| "epoch": 10.88888888888889, |
| "grad_norm": 3.9831533432006836, |
| "learning_rate": 4.555555555555556e-06, |
| "loss": 0.1762, |
| "step": 1274 |
| }, |
| { |
| "epoch": 10.897435897435898, |
| "grad_norm": 2.170370101928711, |
| "learning_rate": 4.551282051282052e-06, |
| "loss": 0.1355, |
| "step": 1275 |
| }, |
| { |
| "epoch": 10.905982905982906, |
| "grad_norm": 5.151463508605957, |
| "learning_rate": 4.547008547008547e-06, |
| "loss": 0.3151, |
| "step": 1276 |
| }, |
| { |
| "epoch": 10.914529914529915, |
| "grad_norm": 2.215559482574463, |
| "learning_rate": 4.542735042735043e-06, |
| "loss": 0.1054, |
| "step": 1277 |
| }, |
| { |
| "epoch": 10.923076923076923, |
| "grad_norm": 3.62188458442688, |
| "learning_rate": 4.538461538461539e-06, |
| "loss": 0.3839, |
| "step": 1278 |
| }, |
| { |
| "epoch": 10.931623931623932, |
| "grad_norm": 1.8855514526367188, |
| "learning_rate": 4.534188034188035e-06, |
| "loss": 0.0639, |
| "step": 1279 |
| }, |
| { |
| "epoch": 10.94017094017094, |
| "grad_norm": 3.0260651111602783, |
| "learning_rate": 4.5299145299145306e-06, |
| "loss": 0.1216, |
| "step": 1280 |
| }, |
| { |
| "epoch": 10.948717948717949, |
| "grad_norm": 13.30820083618164, |
| "learning_rate": 4.525641025641026e-06, |
| "loss": 0.3337, |
| "step": 1281 |
| }, |
| { |
| "epoch": 10.957264957264957, |
| "grad_norm": 4.356720447540283, |
| "learning_rate": 4.5213675213675215e-06, |
| "loss": 0.2692, |
| "step": 1282 |
| }, |
| { |
| "epoch": 10.965811965811966, |
| "grad_norm": 2.077742576599121, |
| "learning_rate": 4.517094017094017e-06, |
| "loss": 0.1181, |
| "step": 1283 |
| }, |
| { |
| "epoch": 10.974358974358974, |
| "grad_norm": 6.6224284172058105, |
| "learning_rate": 4.512820512820513e-06, |
| "loss": 0.1526, |
| "step": 1284 |
| }, |
| { |
| "epoch": 10.982905982905983, |
| "grad_norm": 4.072678565979004, |
| "learning_rate": 4.508547008547009e-06, |
| "loss": 0.1804, |
| "step": 1285 |
| }, |
| { |
| "epoch": 10.991452991452991, |
| "grad_norm": 3.430922269821167, |
| "learning_rate": 4.504273504273505e-06, |
| "loss": 0.1316, |
| "step": 1286 |
| }, |
| { |
| "epoch": 11.0, |
| "grad_norm": 1.6371959447860718, |
| "learning_rate": 4.5e-06, |
| "loss": 0.0596, |
| "step": 1287 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 0.08654214441776276, |
| "eval_runtime": 9.3013, |
| "eval_samples_per_second": 50.1, |
| "eval_steps_per_second": 6.343, |
| "step": 1287 |
| }, |
| { |
| "epoch": 11.008547008547009, |
| "grad_norm": 5.072701454162598, |
| "learning_rate": 4.495726495726496e-06, |
| "loss": 0.2195, |
| "step": 1288 |
| }, |
| { |
| "epoch": 11.017094017094017, |
| "grad_norm": 6.791895389556885, |
| "learning_rate": 4.491452991452992e-06, |
| "loss": 0.5354, |
| "step": 1289 |
| }, |
| { |
| "epoch": 11.025641025641026, |
| "grad_norm": 12.475218772888184, |
| "learning_rate": 4.487179487179488e-06, |
| "loss": 0.1828, |
| "step": 1290 |
| }, |
| { |
| "epoch": 11.034188034188034, |
| "grad_norm": 5.892624855041504, |
| "learning_rate": 4.482905982905984e-06, |
| "loss": 0.1617, |
| "step": 1291 |
| }, |
| { |
| "epoch": 11.042735042735043, |
| "grad_norm": 1.742074728012085, |
| "learning_rate": 4.4786324786324796e-06, |
| "loss": 0.0508, |
| "step": 1292 |
| }, |
| { |
| "epoch": 11.051282051282051, |
| "grad_norm": 2.389373302459717, |
| "learning_rate": 4.474358974358975e-06, |
| "loss": 0.1009, |
| "step": 1293 |
| }, |
| { |
| "epoch": 11.05982905982906, |
| "grad_norm": 3.7152106761932373, |
| "learning_rate": 4.4700854700854705e-06, |
| "loss": 0.2157, |
| "step": 1294 |
| }, |
| { |
| "epoch": 11.068376068376068, |
| "grad_norm": 7.217955112457275, |
| "learning_rate": 4.465811965811966e-06, |
| "loss": 0.2737, |
| "step": 1295 |
| }, |
| { |
| "epoch": 11.076923076923077, |
| "grad_norm": 2.0971977710723877, |
| "learning_rate": 4.461538461538462e-06, |
| "loss": 0.1273, |
| "step": 1296 |
| }, |
| { |
| "epoch": 11.085470085470085, |
| "grad_norm": 1.1616859436035156, |
| "learning_rate": 4.457264957264958e-06, |
| "loss": 0.0325, |
| "step": 1297 |
| }, |
| { |
| "epoch": 11.094017094017094, |
| "grad_norm": 3.4287424087524414, |
| "learning_rate": 4.452991452991453e-06, |
| "loss": 0.1136, |
| "step": 1298 |
| }, |
| { |
| "epoch": 11.102564102564102, |
| "grad_norm": 1.6207005977630615, |
| "learning_rate": 4.448717948717949e-06, |
| "loss": 0.0344, |
| "step": 1299 |
| }, |
| { |
| "epoch": 11.11111111111111, |
| "grad_norm": 3.009976863861084, |
| "learning_rate": 4.444444444444444e-06, |
| "loss": 0.1532, |
| "step": 1300 |
| }, |
| { |
| "epoch": 11.11965811965812, |
| "grad_norm": 2.9768505096435547, |
| "learning_rate": 4.44017094017094e-06, |
| "loss": 0.0874, |
| "step": 1301 |
| }, |
| { |
| "epoch": 11.128205128205128, |
| "grad_norm": 3.622715473175049, |
| "learning_rate": 4.435897435897436e-06, |
| "loss": 0.3132, |
| "step": 1302 |
| }, |
| { |
| "epoch": 11.136752136752136, |
| "grad_norm": 3.5741326808929443, |
| "learning_rate": 4.431623931623932e-06, |
| "loss": 0.0914, |
| "step": 1303 |
| }, |
| { |
| "epoch": 11.145299145299145, |
| "grad_norm": 7.436197280883789, |
| "learning_rate": 4.427350427350428e-06, |
| "loss": 0.329, |
| "step": 1304 |
| }, |
| { |
| "epoch": 11.153846153846153, |
| "grad_norm": 2.390066146850586, |
| "learning_rate": 4.423076923076924e-06, |
| "loss": 0.0867, |
| "step": 1305 |
| }, |
| { |
| "epoch": 11.162393162393162, |
| "grad_norm": 1.928227424621582, |
| "learning_rate": 4.418803418803419e-06, |
| "loss": 0.0294, |
| "step": 1306 |
| }, |
| { |
| "epoch": 11.17094017094017, |
| "grad_norm": 4.40464448928833, |
| "learning_rate": 4.4145299145299145e-06, |
| "loss": 0.3704, |
| "step": 1307 |
| }, |
| { |
| "epoch": 11.179487179487179, |
| "grad_norm": 22.183835983276367, |
| "learning_rate": 4.4102564102564104e-06, |
| "loss": 0.6011, |
| "step": 1308 |
| }, |
| { |
| "epoch": 11.188034188034187, |
| "grad_norm": 2.496633768081665, |
| "learning_rate": 4.405982905982906e-06, |
| "loss": 0.0494, |
| "step": 1309 |
| }, |
| { |
| "epoch": 11.196581196581196, |
| "grad_norm": 1.142687201499939, |
| "learning_rate": 4.401709401709402e-06, |
| "loss": 0.0292, |
| "step": 1310 |
| }, |
| { |
| "epoch": 11.205128205128204, |
| "grad_norm": 2.0762455463409424, |
| "learning_rate": 4.397435897435898e-06, |
| "loss": 0.1123, |
| "step": 1311 |
| }, |
| { |
| "epoch": 11.213675213675213, |
| "grad_norm": 1.5389565229415894, |
| "learning_rate": 4.393162393162393e-06, |
| "loss": 0.0316, |
| "step": 1312 |
| }, |
| { |
| "epoch": 11.222222222222221, |
| "grad_norm": 4.252040386199951, |
| "learning_rate": 4.388888888888889e-06, |
| "loss": 0.0832, |
| "step": 1313 |
| }, |
| { |
| "epoch": 11.23076923076923, |
| "grad_norm": 2.1999545097351074, |
| "learning_rate": 4.384615384615385e-06, |
| "loss": 0.1121, |
| "step": 1314 |
| }, |
| { |
| "epoch": 11.239316239316238, |
| "grad_norm": 3.3256099224090576, |
| "learning_rate": 4.380341880341881e-06, |
| "loss": 0.1288, |
| "step": 1315 |
| }, |
| { |
| "epoch": 11.247863247863247, |
| "grad_norm": 2.6664986610412598, |
| "learning_rate": 4.376068376068377e-06, |
| "loss": 0.1044, |
| "step": 1316 |
| }, |
| { |
| "epoch": 11.256410256410255, |
| "grad_norm": 4.103114604949951, |
| "learning_rate": 4.371794871794872e-06, |
| "loss": 0.3115, |
| "step": 1317 |
| }, |
| { |
| "epoch": 11.264957264957266, |
| "grad_norm": 2.717532157897949, |
| "learning_rate": 4.367521367521368e-06, |
| "loss": 0.1144, |
| "step": 1318 |
| }, |
| { |
| "epoch": 11.273504273504274, |
| "grad_norm": 2.7918317317962646, |
| "learning_rate": 4.3632478632478635e-06, |
| "loss": 0.1205, |
| "step": 1319 |
| }, |
| { |
| "epoch": 11.282051282051283, |
| "grad_norm": 2.439854383468628, |
| "learning_rate": 4.358974358974359e-06, |
| "loss": 0.05, |
| "step": 1320 |
| }, |
| { |
| "epoch": 11.290598290598291, |
| "grad_norm": 1.3528865575790405, |
| "learning_rate": 4.354700854700855e-06, |
| "loss": 0.0437, |
| "step": 1321 |
| }, |
| { |
| "epoch": 11.2991452991453, |
| "grad_norm": 3.3273401260375977, |
| "learning_rate": 4.350427350427351e-06, |
| "loss": 0.1417, |
| "step": 1322 |
| }, |
| { |
| "epoch": 11.307692307692308, |
| "grad_norm": 4.022815704345703, |
| "learning_rate": 4.346153846153846e-06, |
| "loss": 0.0845, |
| "step": 1323 |
| }, |
| { |
| "epoch": 11.316239316239317, |
| "grad_norm": 5.169338703155518, |
| "learning_rate": 4.341880341880342e-06, |
| "loss": 0.5235, |
| "step": 1324 |
| }, |
| { |
| "epoch": 11.324786324786325, |
| "grad_norm": 1.8199687004089355, |
| "learning_rate": 4.337606837606838e-06, |
| "loss": 0.0399, |
| "step": 1325 |
| }, |
| { |
| "epoch": 11.333333333333334, |
| "grad_norm": 3.3616087436676025, |
| "learning_rate": 4.333333333333334e-06, |
| "loss": 0.1428, |
| "step": 1326 |
| }, |
| { |
| "epoch": 11.341880341880342, |
| "grad_norm": 14.056232452392578, |
| "learning_rate": 4.32905982905983e-06, |
| "loss": 0.2921, |
| "step": 1327 |
| }, |
| { |
| "epoch": 11.350427350427351, |
| "grad_norm": 2.3905317783355713, |
| "learning_rate": 4.324786324786326e-06, |
| "loss": 0.0478, |
| "step": 1328 |
| }, |
| { |
| "epoch": 11.35897435897436, |
| "grad_norm": 9.876815795898438, |
| "learning_rate": 4.320512820512821e-06, |
| "loss": 0.1926, |
| "step": 1329 |
| }, |
| { |
| "epoch": 11.367521367521368, |
| "grad_norm": 1.3726049661636353, |
| "learning_rate": 4.316239316239317e-06, |
| "loss": 0.0416, |
| "step": 1330 |
| }, |
| { |
| "epoch": 11.376068376068377, |
| "grad_norm": 3.0890841484069824, |
| "learning_rate": 4.3119658119658125e-06, |
| "loss": 0.0614, |
| "step": 1331 |
| }, |
| { |
| "epoch": 11.384615384615385, |
| "grad_norm": 2.858560562133789, |
| "learning_rate": 4.307692307692308e-06, |
| "loss": 0.2068, |
| "step": 1332 |
| }, |
| { |
| "epoch": 11.393162393162394, |
| "grad_norm": 4.6819658279418945, |
| "learning_rate": 4.303418803418804e-06, |
| "loss": 0.5773, |
| "step": 1333 |
| }, |
| { |
| "epoch": 11.401709401709402, |
| "grad_norm": 1.741450548171997, |
| "learning_rate": 4.299145299145299e-06, |
| "loss": 0.0505, |
| "step": 1334 |
| }, |
| { |
| "epoch": 11.41025641025641, |
| "grad_norm": 3.5882327556610107, |
| "learning_rate": 4.294871794871795e-06, |
| "loss": 0.1797, |
| "step": 1335 |
| }, |
| { |
| "epoch": 11.418803418803419, |
| "grad_norm": 3.59714937210083, |
| "learning_rate": 4.29059829059829e-06, |
| "loss": 0.1531, |
| "step": 1336 |
| }, |
| { |
| "epoch": 11.427350427350428, |
| "grad_norm": 3.619572877883911, |
| "learning_rate": 4.286324786324786e-06, |
| "loss": 0.1028, |
| "step": 1337 |
| }, |
| { |
| "epoch": 11.435897435897436, |
| "grad_norm": 3.9230782985687256, |
| "learning_rate": 4.282051282051282e-06, |
| "loss": 0.2404, |
| "step": 1338 |
| }, |
| { |
| "epoch": 11.444444444444445, |
| "grad_norm": 3.6987717151641846, |
| "learning_rate": 4.277777777777778e-06, |
| "loss": 0.1795, |
| "step": 1339 |
| }, |
| { |
| "epoch": 11.452991452991453, |
| "grad_norm": 3.322707176208496, |
| "learning_rate": 4.273504273504274e-06, |
| "loss": 0.0968, |
| "step": 1340 |
| }, |
| { |
| "epoch": 11.461538461538462, |
| "grad_norm": 1.2378501892089844, |
| "learning_rate": 4.26923076923077e-06, |
| "loss": 0.0387, |
| "step": 1341 |
| }, |
| { |
| "epoch": 11.47008547008547, |
| "grad_norm": 2.6801578998565674, |
| "learning_rate": 4.264957264957265e-06, |
| "loss": 0.0475, |
| "step": 1342 |
| }, |
| { |
| "epoch": 11.478632478632479, |
| "grad_norm": 2.2003352642059326, |
| "learning_rate": 4.260683760683761e-06, |
| "loss": 0.0505, |
| "step": 1343 |
| }, |
| { |
| "epoch": 11.487179487179487, |
| "grad_norm": 1.701341152191162, |
| "learning_rate": 4.2564102564102566e-06, |
| "loss": 0.064, |
| "step": 1344 |
| }, |
| { |
| "epoch": 11.495726495726496, |
| "grad_norm": 9.939803123474121, |
| "learning_rate": 4.2521367521367524e-06, |
| "loss": 0.461, |
| "step": 1345 |
| }, |
| { |
| "epoch": 11.504273504273504, |
| "grad_norm": 3.2999305725097656, |
| "learning_rate": 4.247863247863248e-06, |
| "loss": 0.1653, |
| "step": 1346 |
| }, |
| { |
| "epoch": 11.512820512820513, |
| "grad_norm": 3.9968252182006836, |
| "learning_rate": 4.243589743589744e-06, |
| "loss": 0.123, |
| "step": 1347 |
| }, |
| { |
| "epoch": 11.521367521367521, |
| "grad_norm": 2.846968173980713, |
| "learning_rate": 4.239316239316239e-06, |
| "loss": 0.1161, |
| "step": 1348 |
| }, |
| { |
| "epoch": 11.52991452991453, |
| "grad_norm": 4.328092575073242, |
| "learning_rate": 4.235042735042735e-06, |
| "loss": 0.065, |
| "step": 1349 |
| }, |
| { |
| "epoch": 11.538461538461538, |
| "grad_norm": 3.649003267288208, |
| "learning_rate": 4.230769230769231e-06, |
| "loss": 0.1919, |
| "step": 1350 |
| }, |
| { |
| "epoch": 11.547008547008547, |
| "grad_norm": 4.094634056091309, |
| "learning_rate": 4.226495726495727e-06, |
| "loss": 0.1728, |
| "step": 1351 |
| }, |
| { |
| "epoch": 11.555555555555555, |
| "grad_norm": 2.3904240131378174, |
| "learning_rate": 4.222222222222223e-06, |
| "loss": 0.105, |
| "step": 1352 |
| }, |
| { |
| "epoch": 11.564102564102564, |
| "grad_norm": 1.8493746519088745, |
| "learning_rate": 4.217948717948718e-06, |
| "loss": 0.0373, |
| "step": 1353 |
| }, |
| { |
| "epoch": 11.572649572649572, |
| "grad_norm": 4.690928936004639, |
| "learning_rate": 4.213675213675214e-06, |
| "loss": 0.3405, |
| "step": 1354 |
| }, |
| { |
| "epoch": 11.581196581196581, |
| "grad_norm": 6.808948516845703, |
| "learning_rate": 4.20940170940171e-06, |
| "loss": 0.1308, |
| "step": 1355 |
| }, |
| { |
| "epoch": 11.58974358974359, |
| "grad_norm": 6.060946464538574, |
| "learning_rate": 4.2051282051282055e-06, |
| "loss": 0.1494, |
| "step": 1356 |
| }, |
| { |
| "epoch": 11.598290598290598, |
| "grad_norm": 1.5923279523849487, |
| "learning_rate": 4.2008547008547014e-06, |
| "loss": 0.044, |
| "step": 1357 |
| }, |
| { |
| "epoch": 11.606837606837606, |
| "grad_norm": 1.7796354293823242, |
| "learning_rate": 4.196581196581197e-06, |
| "loss": 0.0558, |
| "step": 1358 |
| }, |
| { |
| "epoch": 11.615384615384615, |
| "grad_norm": 1.2209490537643433, |
| "learning_rate": 4.192307692307692e-06, |
| "loss": 0.0492, |
| "step": 1359 |
| }, |
| { |
| "epoch": 11.623931623931623, |
| "grad_norm": 4.0859880447387695, |
| "learning_rate": 4.188034188034188e-06, |
| "loss": 0.0759, |
| "step": 1360 |
| }, |
| { |
| "epoch": 11.632478632478632, |
| "grad_norm": 3.5021755695343018, |
| "learning_rate": 4.183760683760684e-06, |
| "loss": 0.1263, |
| "step": 1361 |
| }, |
| { |
| "epoch": 11.64102564102564, |
| "grad_norm": 2.5915517807006836, |
| "learning_rate": 4.17948717948718e-06, |
| "loss": 0.1949, |
| "step": 1362 |
| }, |
| { |
| "epoch": 11.649572649572649, |
| "grad_norm": 2.8024656772613525, |
| "learning_rate": 4.175213675213676e-06, |
| "loss": 0.2325, |
| "step": 1363 |
| }, |
| { |
| "epoch": 11.658119658119658, |
| "grad_norm": 5.795172691345215, |
| "learning_rate": 4.170940170940172e-06, |
| "loss": 0.3253, |
| "step": 1364 |
| }, |
| { |
| "epoch": 11.666666666666666, |
| "grad_norm": 5.056031227111816, |
| "learning_rate": 4.166666666666667e-06, |
| "loss": 0.102, |
| "step": 1365 |
| }, |
| { |
| "epoch": 11.675213675213675, |
| "grad_norm": 6.092950820922852, |
| "learning_rate": 4.162393162393163e-06, |
| "loss": 0.1938, |
| "step": 1366 |
| }, |
| { |
| "epoch": 11.683760683760683, |
| "grad_norm": 4.44755744934082, |
| "learning_rate": 4.158119658119659e-06, |
| "loss": 0.1588, |
| "step": 1367 |
| }, |
| { |
| "epoch": 11.692307692307692, |
| "grad_norm": 171.19509887695312, |
| "learning_rate": 4.1538461538461545e-06, |
| "loss": 0.3077, |
| "step": 1368 |
| }, |
| { |
| "epoch": 11.7008547008547, |
| "grad_norm": 13.992602348327637, |
| "learning_rate": 4.1495726495726504e-06, |
| "loss": 0.4401, |
| "step": 1369 |
| }, |
| { |
| "epoch": 11.709401709401709, |
| "grad_norm": 2.2174923419952393, |
| "learning_rate": 4.145299145299146e-06, |
| "loss": 0.1751, |
| "step": 1370 |
| }, |
| { |
| "epoch": 11.717948717948717, |
| "grad_norm": 2.031663179397583, |
| "learning_rate": 4.141025641025641e-06, |
| "loss": 0.049, |
| "step": 1371 |
| }, |
| { |
| "epoch": 11.726495726495726, |
| "grad_norm": 4.201449394226074, |
| "learning_rate": 4.136752136752136e-06, |
| "loss": 0.1016, |
| "step": 1372 |
| }, |
| { |
| "epoch": 11.735042735042736, |
| "grad_norm": 3.953226089477539, |
| "learning_rate": 4.132478632478632e-06, |
| "loss": 0.1336, |
| "step": 1373 |
| }, |
| { |
| "epoch": 11.743589743589745, |
| "grad_norm": 1.4856081008911133, |
| "learning_rate": 4.128205128205128e-06, |
| "loss": 0.0537, |
| "step": 1374 |
| }, |
| { |
| "epoch": 11.752136752136753, |
| "grad_norm": 1.2989288568496704, |
| "learning_rate": 4.123931623931624e-06, |
| "loss": 0.0351, |
| "step": 1375 |
| }, |
| { |
| "epoch": 11.760683760683762, |
| "grad_norm": 4.335974216461182, |
| "learning_rate": 4.11965811965812e-06, |
| "loss": 0.0722, |
| "step": 1376 |
| }, |
| { |
| "epoch": 11.76923076923077, |
| "grad_norm": 6.298306941986084, |
| "learning_rate": 4.115384615384616e-06, |
| "loss": 0.2359, |
| "step": 1377 |
| }, |
| { |
| "epoch": 11.777777777777779, |
| "grad_norm": 0.7119566798210144, |
| "learning_rate": 4.111111111111111e-06, |
| "loss": 0.0192, |
| "step": 1378 |
| }, |
| { |
| "epoch": 11.786324786324787, |
| "grad_norm": 2.7993624210357666, |
| "learning_rate": 4.106837606837607e-06, |
| "loss": 0.0605, |
| "step": 1379 |
| }, |
| { |
| "epoch": 11.794871794871796, |
| "grad_norm": 6.566782474517822, |
| "learning_rate": 4.102564102564103e-06, |
| "loss": 0.3883, |
| "step": 1380 |
| }, |
| { |
| "epoch": 11.803418803418804, |
| "grad_norm": 8.177830696105957, |
| "learning_rate": 4.0982905982905986e-06, |
| "loss": 0.257, |
| "step": 1381 |
| }, |
| { |
| "epoch": 11.811965811965813, |
| "grad_norm": 4.04230260848999, |
| "learning_rate": 4.0940170940170945e-06, |
| "loss": 0.0943, |
| "step": 1382 |
| }, |
| { |
| "epoch": 11.820512820512821, |
| "grad_norm": 3.595386505126953, |
| "learning_rate": 4.08974358974359e-06, |
| "loss": 0.0533, |
| "step": 1383 |
| }, |
| { |
| "epoch": 11.82905982905983, |
| "grad_norm": 3.755312204360962, |
| "learning_rate": 4.085470085470085e-06, |
| "loss": 0.0468, |
| "step": 1384 |
| }, |
| { |
| "epoch": 11.837606837606838, |
| "grad_norm": 2.0697362422943115, |
| "learning_rate": 4.081196581196581e-06, |
| "loss": 0.063, |
| "step": 1385 |
| }, |
| { |
| "epoch": 11.846153846153847, |
| "grad_norm": 7.690021991729736, |
| "learning_rate": 4.076923076923077e-06, |
| "loss": 0.2415, |
| "step": 1386 |
| }, |
| { |
| "epoch": 11.854700854700855, |
| "grad_norm": 3.0239031314849854, |
| "learning_rate": 4.072649572649573e-06, |
| "loss": 0.1257, |
| "step": 1387 |
| }, |
| { |
| "epoch": 11.863247863247864, |
| "grad_norm": 2.263847589492798, |
| "learning_rate": 4.068376068376069e-06, |
| "loss": 0.132, |
| "step": 1388 |
| }, |
| { |
| "epoch": 11.871794871794872, |
| "grad_norm": 2.9513261318206787, |
| "learning_rate": 4.064102564102565e-06, |
| "loss": 0.1229, |
| "step": 1389 |
| }, |
| { |
| "epoch": 11.88034188034188, |
| "grad_norm": 3.03973388671875, |
| "learning_rate": 4.05982905982906e-06, |
| "loss": 0.0966, |
| "step": 1390 |
| }, |
| { |
| "epoch": 11.88888888888889, |
| "grad_norm": 1.0075026750564575, |
| "learning_rate": 4.055555555555556e-06, |
| "loss": 0.0284, |
| "step": 1391 |
| }, |
| { |
| "epoch": 11.897435897435898, |
| "grad_norm": 1.5330802202224731, |
| "learning_rate": 4.051282051282052e-06, |
| "loss": 0.0614, |
| "step": 1392 |
| }, |
| { |
| "epoch": 11.905982905982906, |
| "grad_norm": 3.6498589515686035, |
| "learning_rate": 4.0470085470085476e-06, |
| "loss": 0.2236, |
| "step": 1393 |
| }, |
| { |
| "epoch": 11.914529914529915, |
| "grad_norm": 4.659658908843994, |
| "learning_rate": 4.0427350427350435e-06, |
| "loss": 0.3245, |
| "step": 1394 |
| }, |
| { |
| "epoch": 11.923076923076923, |
| "grad_norm": 3.921703815460205, |
| "learning_rate": 4.0384615384615385e-06, |
| "loss": 0.2981, |
| "step": 1395 |
| }, |
| { |
| "epoch": 11.931623931623932, |
| "grad_norm": 5.816749572753906, |
| "learning_rate": 4.034188034188034e-06, |
| "loss": 0.1606, |
| "step": 1396 |
| }, |
| { |
| "epoch": 11.94017094017094, |
| "grad_norm": 1.2831742763519287, |
| "learning_rate": 4.02991452991453e-06, |
| "loss": 0.0307, |
| "step": 1397 |
| }, |
| { |
| "epoch": 11.948717948717949, |
| "grad_norm": 5.745227813720703, |
| "learning_rate": 4.025641025641026e-06, |
| "loss": 0.5323, |
| "step": 1398 |
| }, |
| { |
| "epoch": 11.957264957264957, |
| "grad_norm": 2.4196462631225586, |
| "learning_rate": 4.021367521367522e-06, |
| "loss": 0.09, |
| "step": 1399 |
| }, |
| { |
| "epoch": 11.965811965811966, |
| "grad_norm": 8.084505081176758, |
| "learning_rate": 4.017094017094018e-06, |
| "loss": 0.2991, |
| "step": 1400 |
| }, |
| { |
| "epoch": 11.974358974358974, |
| "grad_norm": 3.786708116531372, |
| "learning_rate": 4.012820512820513e-06, |
| "loss": 0.2163, |
| "step": 1401 |
| }, |
| { |
| "epoch": 11.982905982905983, |
| "grad_norm": 4.76535701751709, |
| "learning_rate": 4.008547008547009e-06, |
| "loss": 0.2453, |
| "step": 1402 |
| }, |
| { |
| "epoch": 11.991452991452991, |
| "grad_norm": 7.380269527435303, |
| "learning_rate": 4.004273504273505e-06, |
| "loss": 0.3525, |
| "step": 1403 |
| }, |
| { |
| "epoch": 12.0, |
| "grad_norm": 41.21335983276367, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.2139, |
| "step": 1404 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.07730000466108322, |
| "eval_runtime": 9.2426, |
| "eval_samples_per_second": 50.419, |
| "eval_steps_per_second": 6.383, |
| "step": 1404 |
| }, |
| { |
| "epoch": 12.008547008547009, |
| "grad_norm": 2.3692574501037598, |
| "learning_rate": 3.9957264957264966e-06, |
| "loss": 0.0939, |
| "step": 1405 |
| }, |
| { |
| "epoch": 12.017094017094017, |
| "grad_norm": 8.087658882141113, |
| "learning_rate": 3.9914529914529924e-06, |
| "loss": 0.2801, |
| "step": 1406 |
| }, |
| { |
| "epoch": 12.025641025641026, |
| "grad_norm": 8.448614120483398, |
| "learning_rate": 3.9871794871794875e-06, |
| "loss": 0.2069, |
| "step": 1407 |
| }, |
| { |
| "epoch": 12.034188034188034, |
| "grad_norm": 1.8581651449203491, |
| "learning_rate": 3.982905982905983e-06, |
| "loss": 0.0509, |
| "step": 1408 |
| }, |
| { |
| "epoch": 12.042735042735043, |
| "grad_norm": 1.711654543876648, |
| "learning_rate": 3.9786324786324784e-06, |
| "loss": 0.0464, |
| "step": 1409 |
| }, |
| { |
| "epoch": 12.051282051282051, |
| "grad_norm": 1.482553482055664, |
| "learning_rate": 3.974358974358974e-06, |
| "loss": 0.028, |
| "step": 1410 |
| }, |
| { |
| "epoch": 12.05982905982906, |
| "grad_norm": 8.005542755126953, |
| "learning_rate": 3.97008547008547e-06, |
| "loss": 0.2587, |
| "step": 1411 |
| }, |
| { |
| "epoch": 12.068376068376068, |
| "grad_norm": 2.1153948307037354, |
| "learning_rate": 3.965811965811966e-06, |
| "loss": 0.0563, |
| "step": 1412 |
| }, |
| { |
| "epoch": 12.076923076923077, |
| "grad_norm": 7.791186809539795, |
| "learning_rate": 3.961538461538462e-06, |
| "loss": 0.0587, |
| "step": 1413 |
| }, |
| { |
| "epoch": 12.085470085470085, |
| "grad_norm": 21.04537582397461, |
| "learning_rate": 3.957264957264957e-06, |
| "loss": 0.252, |
| "step": 1414 |
| }, |
| { |
| "epoch": 12.094017094017094, |
| "grad_norm": 3.144742727279663, |
| "learning_rate": 3.952991452991453e-06, |
| "loss": 0.2207, |
| "step": 1415 |
| }, |
| { |
| "epoch": 12.102564102564102, |
| "grad_norm": 2.23223614692688, |
| "learning_rate": 3.948717948717949e-06, |
| "loss": 0.0923, |
| "step": 1416 |
| }, |
| { |
| "epoch": 12.11111111111111, |
| "grad_norm": 3.5652217864990234, |
| "learning_rate": 3.944444444444445e-06, |
| "loss": 0.2197, |
| "step": 1417 |
| }, |
| { |
| "epoch": 12.11965811965812, |
| "grad_norm": 3.1105499267578125, |
| "learning_rate": 3.940170940170941e-06, |
| "loss": 0.071, |
| "step": 1418 |
| }, |
| { |
| "epoch": 12.128205128205128, |
| "grad_norm": 2.525405168533325, |
| "learning_rate": 3.9358974358974365e-06, |
| "loss": 0.0874, |
| "step": 1419 |
| }, |
| { |
| "epoch": 12.136752136752136, |
| "grad_norm": 4.479174613952637, |
| "learning_rate": 3.9316239316239315e-06, |
| "loss": 0.1872, |
| "step": 1420 |
| }, |
| { |
| "epoch": 12.145299145299145, |
| "grad_norm": 2.0484113693237305, |
| "learning_rate": 3.927350427350427e-06, |
| "loss": 0.0739, |
| "step": 1421 |
| }, |
| { |
| "epoch": 12.153846153846153, |
| "grad_norm": 2.014679431915283, |
| "learning_rate": 3.923076923076923e-06, |
| "loss": 0.1089, |
| "step": 1422 |
| }, |
| { |
| "epoch": 12.162393162393162, |
| "grad_norm": 4.71014404296875, |
| "learning_rate": 3.918803418803419e-06, |
| "loss": 0.3136, |
| "step": 1423 |
| }, |
| { |
| "epoch": 12.17094017094017, |
| "grad_norm": 2.1372437477111816, |
| "learning_rate": 3.914529914529915e-06, |
| "loss": 0.0458, |
| "step": 1424 |
| }, |
| { |
| "epoch": 12.179487179487179, |
| "grad_norm": 1.4595564603805542, |
| "learning_rate": 3.910256410256411e-06, |
| "loss": 0.0601, |
| "step": 1425 |
| }, |
| { |
| "epoch": 12.188034188034187, |
| "grad_norm": 4.45602560043335, |
| "learning_rate": 3.905982905982906e-06, |
| "loss": 0.091, |
| "step": 1426 |
| }, |
| { |
| "epoch": 12.196581196581196, |
| "grad_norm": 1.473585844039917, |
| "learning_rate": 3.901709401709402e-06, |
| "loss": 0.0515, |
| "step": 1427 |
| }, |
| { |
| "epoch": 12.205128205128204, |
| "grad_norm": 1.8761534690856934, |
| "learning_rate": 3.897435897435898e-06, |
| "loss": 0.055, |
| "step": 1428 |
| }, |
| { |
| "epoch": 12.213675213675213, |
| "grad_norm": 0.7121579647064209, |
| "learning_rate": 3.893162393162394e-06, |
| "loss": 0.0197, |
| "step": 1429 |
| }, |
| { |
| "epoch": 12.222222222222221, |
| "grad_norm": 2.0035219192504883, |
| "learning_rate": 3.88888888888889e-06, |
| "loss": 0.0904, |
| "step": 1430 |
| }, |
| { |
| "epoch": 12.23076923076923, |
| "grad_norm": 3.820181369781494, |
| "learning_rate": 3.884615384615385e-06, |
| "loss": 0.2415, |
| "step": 1431 |
| }, |
| { |
| "epoch": 12.239316239316238, |
| "grad_norm": 3.40633225440979, |
| "learning_rate": 3.8803418803418805e-06, |
| "loss": 0.0593, |
| "step": 1432 |
| }, |
| { |
| "epoch": 12.247863247863247, |
| "grad_norm": 7.093897342681885, |
| "learning_rate": 3.876068376068376e-06, |
| "loss": 0.2504, |
| "step": 1433 |
| }, |
| { |
| "epoch": 12.256410256410255, |
| "grad_norm": 2.1057517528533936, |
| "learning_rate": 3.871794871794872e-06, |
| "loss": 0.0573, |
| "step": 1434 |
| }, |
| { |
| "epoch": 12.264957264957266, |
| "grad_norm": 4.797401428222656, |
| "learning_rate": 3.867521367521368e-06, |
| "loss": 0.338, |
| "step": 1435 |
| }, |
| { |
| "epoch": 12.273504273504274, |
| "grad_norm": 20.711339950561523, |
| "learning_rate": 3.863247863247864e-06, |
| "loss": 0.1964, |
| "step": 1436 |
| }, |
| { |
| "epoch": 12.282051282051283, |
| "grad_norm": 2.725280523300171, |
| "learning_rate": 3.858974358974359e-06, |
| "loss": 0.1837, |
| "step": 1437 |
| }, |
| { |
| "epoch": 12.290598290598291, |
| "grad_norm": 0.9469479322433472, |
| "learning_rate": 3.854700854700855e-06, |
| "loss": 0.0231, |
| "step": 1438 |
| }, |
| { |
| "epoch": 12.2991452991453, |
| "grad_norm": 2.0424935817718506, |
| "learning_rate": 3.850427350427351e-06, |
| "loss": 0.1373, |
| "step": 1439 |
| }, |
| { |
| "epoch": 12.307692307692308, |
| "grad_norm": 1.4781558513641357, |
| "learning_rate": 3.846153846153847e-06, |
| "loss": 0.0393, |
| "step": 1440 |
| }, |
| { |
| "epoch": 12.316239316239317, |
| "grad_norm": 3.7576427459716797, |
| "learning_rate": 3.841880341880343e-06, |
| "loss": 0.1134, |
| "step": 1441 |
| }, |
| { |
| "epoch": 12.324786324786325, |
| "grad_norm": 299.5986633300781, |
| "learning_rate": 3.8376068376068386e-06, |
| "loss": 0.8017, |
| "step": 1442 |
| }, |
| { |
| "epoch": 12.333333333333334, |
| "grad_norm": 3.109199047088623, |
| "learning_rate": 3.833333333333334e-06, |
| "loss": 0.1014, |
| "step": 1443 |
| }, |
| { |
| "epoch": 12.341880341880342, |
| "grad_norm": 6.353960990905762, |
| "learning_rate": 3.8290598290598295e-06, |
| "loss": 0.3484, |
| "step": 1444 |
| }, |
| { |
| "epoch": 12.350427350427351, |
| "grad_norm": 12.957517623901367, |
| "learning_rate": 3.8247863247863246e-06, |
| "loss": 0.5644, |
| "step": 1445 |
| }, |
| { |
| "epoch": 12.35897435897436, |
| "grad_norm": 10.197676658630371, |
| "learning_rate": 3.8205128205128204e-06, |
| "loss": 0.1525, |
| "step": 1446 |
| }, |
| { |
| "epoch": 12.367521367521368, |
| "grad_norm": 1.7754546403884888, |
| "learning_rate": 3.816239316239316e-06, |
| "loss": 0.0259, |
| "step": 1447 |
| }, |
| { |
| "epoch": 12.376068376068377, |
| "grad_norm": 1.4237226247787476, |
| "learning_rate": 3.8119658119658122e-06, |
| "loss": 0.0307, |
| "step": 1448 |
| }, |
| { |
| "epoch": 12.384615384615385, |
| "grad_norm": 2.94474458694458, |
| "learning_rate": 3.8076923076923077e-06, |
| "loss": 0.1447, |
| "step": 1449 |
| }, |
| { |
| "epoch": 12.393162393162394, |
| "grad_norm": 3.7823615074157715, |
| "learning_rate": 3.8034188034188036e-06, |
| "loss": 0.071, |
| "step": 1450 |
| }, |
| { |
| "epoch": 12.401709401709402, |
| "grad_norm": 7.5281081199646, |
| "learning_rate": 3.7991452991452995e-06, |
| "loss": 0.1805, |
| "step": 1451 |
| }, |
| { |
| "epoch": 12.41025641025641, |
| "grad_norm": 2.523592233657837, |
| "learning_rate": 3.794871794871795e-06, |
| "loss": 0.0684, |
| "step": 1452 |
| }, |
| { |
| "epoch": 12.418803418803419, |
| "grad_norm": 2.423443078994751, |
| "learning_rate": 3.790598290598291e-06, |
| "loss": 0.0726, |
| "step": 1453 |
| }, |
| { |
| "epoch": 12.427350427350428, |
| "grad_norm": 6.3336005210876465, |
| "learning_rate": 3.7863247863247863e-06, |
| "loss": 0.1684, |
| "step": 1454 |
| }, |
| { |
| "epoch": 12.435897435897436, |
| "grad_norm": 248.31146240234375, |
| "learning_rate": 3.782051282051282e-06, |
| "loss": 0.6863, |
| "step": 1455 |
| }, |
| { |
| "epoch": 12.444444444444445, |
| "grad_norm": 3.0117695331573486, |
| "learning_rate": 3.777777777777778e-06, |
| "loss": 0.217, |
| "step": 1456 |
| }, |
| { |
| "epoch": 12.452991452991453, |
| "grad_norm": 1.4753539562225342, |
| "learning_rate": 3.7735042735042735e-06, |
| "loss": 0.0623, |
| "step": 1457 |
| }, |
| { |
| "epoch": 12.461538461538462, |
| "grad_norm": 2.095745325088501, |
| "learning_rate": 3.7692307692307694e-06, |
| "loss": 0.055, |
| "step": 1458 |
| }, |
| { |
| "epoch": 12.47008547008547, |
| "grad_norm": 3.508305788040161, |
| "learning_rate": 3.7649572649572653e-06, |
| "loss": 0.1097, |
| "step": 1459 |
| }, |
| { |
| "epoch": 12.478632478632479, |
| "grad_norm": 3.0965282917022705, |
| "learning_rate": 3.760683760683761e-06, |
| "loss": 0.3374, |
| "step": 1460 |
| }, |
| { |
| "epoch": 12.487179487179487, |
| "grad_norm": 0.7286785244941711, |
| "learning_rate": 3.7564102564102567e-06, |
| "loss": 0.0182, |
| "step": 1461 |
| }, |
| { |
| "epoch": 12.495726495726496, |
| "grad_norm": 5.957888126373291, |
| "learning_rate": 3.7521367521367526e-06, |
| "loss": 0.3498, |
| "step": 1462 |
| }, |
| { |
| "epoch": 12.504273504273504, |
| "grad_norm": 10.433263778686523, |
| "learning_rate": 3.747863247863248e-06, |
| "loss": 0.446, |
| "step": 1463 |
| }, |
| { |
| "epoch": 12.512820512820513, |
| "grad_norm": 4.565568923950195, |
| "learning_rate": 3.743589743589744e-06, |
| "loss": 0.1026, |
| "step": 1464 |
| }, |
| { |
| "epoch": 12.521367521367521, |
| "grad_norm": 2.607106924057007, |
| "learning_rate": 3.73931623931624e-06, |
| "loss": 0.0912, |
| "step": 1465 |
| }, |
| { |
| "epoch": 12.52991452991453, |
| "grad_norm": 2.415541410446167, |
| "learning_rate": 3.7350427350427353e-06, |
| "loss": 0.0594, |
| "step": 1466 |
| }, |
| { |
| "epoch": 12.538461538461538, |
| "grad_norm": 7.978870868682861, |
| "learning_rate": 3.730769230769231e-06, |
| "loss": 0.2617, |
| "step": 1467 |
| }, |
| { |
| "epoch": 12.547008547008547, |
| "grad_norm": 6.858293056488037, |
| "learning_rate": 3.726495726495727e-06, |
| "loss": 0.3642, |
| "step": 1468 |
| }, |
| { |
| "epoch": 12.555555555555555, |
| "grad_norm": 1.3900551795959473, |
| "learning_rate": 3.7222222222222225e-06, |
| "loss": 0.0445, |
| "step": 1469 |
| }, |
| { |
| "epoch": 12.564102564102564, |
| "grad_norm": 8.111970901489258, |
| "learning_rate": 3.7179487179487184e-06, |
| "loss": 0.1828, |
| "step": 1470 |
| }, |
| { |
| "epoch": 12.572649572649572, |
| "grad_norm": 2.731841802597046, |
| "learning_rate": 3.7136752136752143e-06, |
| "loss": 0.2027, |
| "step": 1471 |
| }, |
| { |
| "epoch": 12.581196581196581, |
| "grad_norm": 4.418527126312256, |
| "learning_rate": 3.7094017094017098e-06, |
| "loss": 0.1744, |
| "step": 1472 |
| }, |
| { |
| "epoch": 12.58974358974359, |
| "grad_norm": 2.8263015747070312, |
| "learning_rate": 3.7051282051282057e-06, |
| "loss": 0.1123, |
| "step": 1473 |
| }, |
| { |
| "epoch": 12.598290598290598, |
| "grad_norm": 2.3524725437164307, |
| "learning_rate": 3.700854700854701e-06, |
| "loss": 0.0999, |
| "step": 1474 |
| }, |
| { |
| "epoch": 12.606837606837606, |
| "grad_norm": 9.863709449768066, |
| "learning_rate": 3.696581196581197e-06, |
| "loss": 0.4589, |
| "step": 1475 |
| }, |
| { |
| "epoch": 12.615384615384615, |
| "grad_norm": 3.5506396293640137, |
| "learning_rate": 3.692307692307693e-06, |
| "loss": 0.2034, |
| "step": 1476 |
| }, |
| { |
| "epoch": 12.623931623931623, |
| "grad_norm": 2.4352779388427734, |
| "learning_rate": 3.6880341880341884e-06, |
| "loss": 0.0806, |
| "step": 1477 |
| }, |
| { |
| "epoch": 12.632478632478632, |
| "grad_norm": 1.8339797258377075, |
| "learning_rate": 3.6837606837606843e-06, |
| "loss": 0.0635, |
| "step": 1478 |
| }, |
| { |
| "epoch": 12.64102564102564, |
| "grad_norm": 4.63474178314209, |
| "learning_rate": 3.67948717948718e-06, |
| "loss": 0.4568, |
| "step": 1479 |
| }, |
| { |
| "epoch": 12.649572649572649, |
| "grad_norm": 7.696872711181641, |
| "learning_rate": 3.6752136752136756e-06, |
| "loss": 0.1769, |
| "step": 1480 |
| }, |
| { |
| "epoch": 12.658119658119658, |
| "grad_norm": 1.3894271850585938, |
| "learning_rate": 3.670940170940171e-06, |
| "loss": 0.0747, |
| "step": 1481 |
| }, |
| { |
| "epoch": 12.666666666666666, |
| "grad_norm": 5.607828140258789, |
| "learning_rate": 3.6666666666666666e-06, |
| "loss": 0.1178, |
| "step": 1482 |
| }, |
| { |
| "epoch": 12.675213675213675, |
| "grad_norm": 2.120594024658203, |
| "learning_rate": 3.6623931623931625e-06, |
| "loss": 0.0497, |
| "step": 1483 |
| }, |
| { |
| "epoch": 12.683760683760683, |
| "grad_norm": 1.359381914138794, |
| "learning_rate": 3.6581196581196584e-06, |
| "loss": 0.035, |
| "step": 1484 |
| }, |
| { |
| "epoch": 12.692307692307692, |
| "grad_norm": 2.8533923625946045, |
| "learning_rate": 3.653846153846154e-06, |
| "loss": 0.1048, |
| "step": 1485 |
| }, |
| { |
| "epoch": 12.7008547008547, |
| "grad_norm": 6.021198749542236, |
| "learning_rate": 3.6495726495726497e-06, |
| "loss": 0.1604, |
| "step": 1486 |
| }, |
| { |
| "epoch": 12.709401709401709, |
| "grad_norm": 7.198216915130615, |
| "learning_rate": 3.6452991452991456e-06, |
| "loss": 0.1656, |
| "step": 1487 |
| }, |
| { |
| "epoch": 12.717948717948717, |
| "grad_norm": 1.4581981897354126, |
| "learning_rate": 3.641025641025641e-06, |
| "loss": 0.0398, |
| "step": 1488 |
| }, |
| { |
| "epoch": 12.726495726495726, |
| "grad_norm": 30.704627990722656, |
| "learning_rate": 3.636752136752137e-06, |
| "loss": 0.3371, |
| "step": 1489 |
| }, |
| { |
| "epoch": 12.735042735042736, |
| "grad_norm": 2.5204057693481445, |
| "learning_rate": 3.632478632478633e-06, |
| "loss": 0.0742, |
| "step": 1490 |
| }, |
| { |
| "epoch": 12.743589743589745, |
| "grad_norm": 2.3917508125305176, |
| "learning_rate": 3.6282051282051283e-06, |
| "loss": 0.1681, |
| "step": 1491 |
| }, |
| { |
| "epoch": 12.752136752136753, |
| "grad_norm": 1.4529337882995605, |
| "learning_rate": 3.623931623931624e-06, |
| "loss": 0.0247, |
| "step": 1492 |
| }, |
| { |
| "epoch": 12.760683760683762, |
| "grad_norm": 31.894805908203125, |
| "learning_rate": 3.6196581196581197e-06, |
| "loss": 0.2222, |
| "step": 1493 |
| }, |
| { |
| "epoch": 12.76923076923077, |
| "grad_norm": 3.4240164756774902, |
| "learning_rate": 3.6153846153846156e-06, |
| "loss": 0.1432, |
| "step": 1494 |
| }, |
| { |
| "epoch": 12.777777777777779, |
| "grad_norm": 2.0000102519989014, |
| "learning_rate": 3.6111111111111115e-06, |
| "loss": 0.0383, |
| "step": 1495 |
| }, |
| { |
| "epoch": 12.786324786324787, |
| "grad_norm": 3.7665908336639404, |
| "learning_rate": 3.606837606837607e-06, |
| "loss": 0.2719, |
| "step": 1496 |
| }, |
| { |
| "epoch": 12.794871794871796, |
| "grad_norm": 2.0319290161132812, |
| "learning_rate": 3.602564102564103e-06, |
| "loss": 0.0741, |
| "step": 1497 |
| }, |
| { |
| "epoch": 12.803418803418804, |
| "grad_norm": 2.3379619121551514, |
| "learning_rate": 3.5982905982905987e-06, |
| "loss": 0.1155, |
| "step": 1498 |
| }, |
| { |
| "epoch": 12.811965811965813, |
| "grad_norm": 5.183985233306885, |
| "learning_rate": 3.594017094017094e-06, |
| "loss": 0.0815, |
| "step": 1499 |
| }, |
| { |
| "epoch": 12.820512820512821, |
| "grad_norm": 3.1432502269744873, |
| "learning_rate": 3.58974358974359e-06, |
| "loss": 0.1855, |
| "step": 1500 |
| }, |
| { |
| "epoch": 12.82905982905983, |
| "grad_norm": 4.5739946365356445, |
| "learning_rate": 3.585470085470086e-06, |
| "loss": 0.1424, |
| "step": 1501 |
| }, |
| { |
| "epoch": 12.837606837606838, |
| "grad_norm": 1.6006520986557007, |
| "learning_rate": 3.5811965811965814e-06, |
| "loss": 0.0305, |
| "step": 1502 |
| }, |
| { |
| "epoch": 12.846153846153847, |
| "grad_norm": 3.937011241912842, |
| "learning_rate": 3.5769230769230773e-06, |
| "loss": 0.2497, |
| "step": 1503 |
| }, |
| { |
| "epoch": 12.854700854700855, |
| "grad_norm": 2.6159651279449463, |
| "learning_rate": 3.572649572649573e-06, |
| "loss": 0.1067, |
| "step": 1504 |
| }, |
| { |
| "epoch": 12.863247863247864, |
| "grad_norm": 2.578547239303589, |
| "learning_rate": 3.5683760683760687e-06, |
| "loss": 0.0663, |
| "step": 1505 |
| }, |
| { |
| "epoch": 12.871794871794872, |
| "grad_norm": 2.3777639865875244, |
| "learning_rate": 3.5641025641025646e-06, |
| "loss": 0.0558, |
| "step": 1506 |
| }, |
| { |
| "epoch": 12.88034188034188, |
| "grad_norm": 7.5656561851501465, |
| "learning_rate": 3.5598290598290604e-06, |
| "loss": 0.2448, |
| "step": 1507 |
| }, |
| { |
| "epoch": 12.88888888888889, |
| "grad_norm": 4.21798849105835, |
| "learning_rate": 3.555555555555556e-06, |
| "loss": 0.1916, |
| "step": 1508 |
| }, |
| { |
| "epoch": 12.897435897435898, |
| "grad_norm": 1.318049669265747, |
| "learning_rate": 3.551282051282052e-06, |
| "loss": 0.0387, |
| "step": 1509 |
| }, |
| { |
| "epoch": 12.905982905982906, |
| "grad_norm": 2.4345362186431885, |
| "learning_rate": 3.5470085470085473e-06, |
| "loss": 0.061, |
| "step": 1510 |
| }, |
| { |
| "epoch": 12.914529914529915, |
| "grad_norm": 3.2767112255096436, |
| "learning_rate": 3.542735042735043e-06, |
| "loss": 0.1627, |
| "step": 1511 |
| }, |
| { |
| "epoch": 12.923076923076923, |
| "grad_norm": 6.881056785583496, |
| "learning_rate": 3.538461538461539e-06, |
| "loss": 0.2452, |
| "step": 1512 |
| }, |
| { |
| "epoch": 12.931623931623932, |
| "grad_norm": 8.017362594604492, |
| "learning_rate": 3.5341880341880345e-06, |
| "loss": 0.1972, |
| "step": 1513 |
| }, |
| { |
| "epoch": 12.94017094017094, |
| "grad_norm": 1.1411398649215698, |
| "learning_rate": 3.5299145299145304e-06, |
| "loss": 0.0243, |
| "step": 1514 |
| }, |
| { |
| "epoch": 12.948717948717949, |
| "grad_norm": 4.486563205718994, |
| "learning_rate": 3.5256410256410263e-06, |
| "loss": 0.1347, |
| "step": 1515 |
| }, |
| { |
| "epoch": 12.957264957264957, |
| "grad_norm": 2.348222494125366, |
| "learning_rate": 3.5213675213675218e-06, |
| "loss": 0.1828, |
| "step": 1516 |
| }, |
| { |
| "epoch": 12.965811965811966, |
| "grad_norm": 2.2855775356292725, |
| "learning_rate": 3.5170940170940177e-06, |
| "loss": 0.0465, |
| "step": 1517 |
| }, |
| { |
| "epoch": 12.974358974358974, |
| "grad_norm": 10.313456535339355, |
| "learning_rate": 3.5128205128205127e-06, |
| "loss": 0.3033, |
| "step": 1518 |
| }, |
| { |
| "epoch": 12.982905982905983, |
| "grad_norm": 12.115890502929688, |
| "learning_rate": 3.5085470085470086e-06, |
| "loss": 0.6762, |
| "step": 1519 |
| }, |
| { |
| "epoch": 12.991452991452991, |
| "grad_norm": 2.746267557144165, |
| "learning_rate": 3.5042735042735045e-06, |
| "loss": 0.123, |
| "step": 1520 |
| }, |
| { |
| "epoch": 13.0, |
| "grad_norm": 5.204991340637207, |
| "learning_rate": 3.5e-06, |
| "loss": 0.2086, |
| "step": 1521 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 0.06878729909658432, |
| "eval_runtime": 9.2334, |
| "eval_samples_per_second": 50.469, |
| "eval_steps_per_second": 6.39, |
| "step": 1521 |
| }, |
| { |
| "epoch": 13.008547008547009, |
| "grad_norm": 1.8741862773895264, |
| "learning_rate": 3.495726495726496e-06, |
| "loss": 0.0594, |
| "step": 1522 |
| }, |
| { |
| "epoch": 13.017094017094017, |
| "grad_norm": 1.6060154438018799, |
| "learning_rate": 3.4914529914529917e-06, |
| "loss": 0.0426, |
| "step": 1523 |
| }, |
| { |
| "epoch": 13.025641025641026, |
| "grad_norm": 2.194714069366455, |
| "learning_rate": 3.487179487179487e-06, |
| "loss": 0.1907, |
| "step": 1524 |
| }, |
| { |
| "epoch": 13.034188034188034, |
| "grad_norm": 0.716149628162384, |
| "learning_rate": 3.482905982905983e-06, |
| "loss": 0.0177, |
| "step": 1525 |
| }, |
| { |
| "epoch": 13.042735042735043, |
| "grad_norm": 4.787989139556885, |
| "learning_rate": 3.478632478632479e-06, |
| "loss": 0.246, |
| "step": 1526 |
| }, |
| { |
| "epoch": 13.051282051282051, |
| "grad_norm": 1.662338137626648, |
| "learning_rate": 3.4743589743589744e-06, |
| "loss": 0.0561, |
| "step": 1527 |
| }, |
| { |
| "epoch": 13.05982905982906, |
| "grad_norm": 0.9663236737251282, |
| "learning_rate": 3.4700854700854703e-06, |
| "loss": 0.0392, |
| "step": 1528 |
| }, |
| { |
| "epoch": 13.068376068376068, |
| "grad_norm": 0.8232766389846802, |
| "learning_rate": 3.465811965811966e-06, |
| "loss": 0.0221, |
| "step": 1529 |
| }, |
| { |
| "epoch": 13.076923076923077, |
| "grad_norm": 2.434157609939575, |
| "learning_rate": 3.4615384615384617e-06, |
| "loss": 0.1777, |
| "step": 1530 |
| }, |
| { |
| "epoch": 13.085470085470085, |
| "grad_norm": 2.768070936203003, |
| "learning_rate": 3.4572649572649576e-06, |
| "loss": 0.1101, |
| "step": 1531 |
| }, |
| { |
| "epoch": 13.094017094017094, |
| "grad_norm": 2.061371088027954, |
| "learning_rate": 3.452991452991453e-06, |
| "loss": 0.0591, |
| "step": 1532 |
| }, |
| { |
| "epoch": 13.102564102564102, |
| "grad_norm": 1.6127598285675049, |
| "learning_rate": 3.448717948717949e-06, |
| "loss": 0.3858, |
| "step": 1533 |
| }, |
| { |
| "epoch": 13.11111111111111, |
| "grad_norm": 1.2561885118484497, |
| "learning_rate": 3.444444444444445e-06, |
| "loss": 0.0315, |
| "step": 1534 |
| }, |
| { |
| "epoch": 13.11965811965812, |
| "grad_norm": 2.2859408855438232, |
| "learning_rate": 3.4401709401709403e-06, |
| "loss": 0.047, |
| "step": 1535 |
| }, |
| { |
| "epoch": 13.128205128205128, |
| "grad_norm": 3.7528388500213623, |
| "learning_rate": 3.435897435897436e-06, |
| "loss": 0.1069, |
| "step": 1536 |
| }, |
| { |
| "epoch": 13.136752136752136, |
| "grad_norm": 5.547614574432373, |
| "learning_rate": 3.431623931623932e-06, |
| "loss": 0.1411, |
| "step": 1537 |
| }, |
| { |
| "epoch": 13.145299145299145, |
| "grad_norm": 1.6566565036773682, |
| "learning_rate": 3.4273504273504275e-06, |
| "loss": 0.0266, |
| "step": 1538 |
| }, |
| { |
| "epoch": 13.153846153846153, |
| "grad_norm": 5.280163288116455, |
| "learning_rate": 3.4230769230769234e-06, |
| "loss": 0.0843, |
| "step": 1539 |
| }, |
| { |
| "epoch": 13.162393162393162, |
| "grad_norm": 6.624744892120361, |
| "learning_rate": 3.4188034188034193e-06, |
| "loss": 0.1652, |
| "step": 1540 |
| }, |
| { |
| "epoch": 13.17094017094017, |
| "grad_norm": 5.325616359710693, |
| "learning_rate": 3.414529914529915e-06, |
| "loss": 0.077, |
| "step": 1541 |
| }, |
| { |
| "epoch": 13.179487179487179, |
| "grad_norm": 11.31779956817627, |
| "learning_rate": 3.4102564102564107e-06, |
| "loss": 0.4377, |
| "step": 1542 |
| }, |
| { |
| "epoch": 13.188034188034187, |
| "grad_norm": 4.86885404586792, |
| "learning_rate": 3.4059829059829066e-06, |
| "loss": 0.2312, |
| "step": 1543 |
| }, |
| { |
| "epoch": 13.196581196581196, |
| "grad_norm": 1.779068112373352, |
| "learning_rate": 3.401709401709402e-06, |
| "loss": 0.032, |
| "step": 1544 |
| }, |
| { |
| "epoch": 13.205128205128204, |
| "grad_norm": 1.9934108257293701, |
| "learning_rate": 3.397435897435898e-06, |
| "loss": 0.0861, |
| "step": 1545 |
| }, |
| { |
| "epoch": 13.213675213675213, |
| "grad_norm": 2.1829612255096436, |
| "learning_rate": 3.3931623931623934e-06, |
| "loss": 0.0855, |
| "step": 1546 |
| }, |
| { |
| "epoch": 13.222222222222221, |
| "grad_norm": 31.108810424804688, |
| "learning_rate": 3.3888888888888893e-06, |
| "loss": 0.334, |
| "step": 1547 |
| }, |
| { |
| "epoch": 13.23076923076923, |
| "grad_norm": 4.867705345153809, |
| "learning_rate": 3.384615384615385e-06, |
| "loss": 0.0808, |
| "step": 1548 |
| }, |
| { |
| "epoch": 13.239316239316238, |
| "grad_norm": 3.226783275604248, |
| "learning_rate": 3.3803418803418806e-06, |
| "loss": 0.1806, |
| "step": 1549 |
| }, |
| { |
| "epoch": 13.247863247863247, |
| "grad_norm": 1.4822824001312256, |
| "learning_rate": 3.3760683760683765e-06, |
| "loss": 0.0602, |
| "step": 1550 |
| }, |
| { |
| "epoch": 13.256410256410255, |
| "grad_norm": 4.529379844665527, |
| "learning_rate": 3.3717948717948724e-06, |
| "loss": 0.318, |
| "step": 1551 |
| }, |
| { |
| "epoch": 13.264957264957266, |
| "grad_norm": 3.2155706882476807, |
| "learning_rate": 3.367521367521368e-06, |
| "loss": 0.1006, |
| "step": 1552 |
| }, |
| { |
| "epoch": 13.273504273504274, |
| "grad_norm": 2.2805707454681396, |
| "learning_rate": 3.3632478632478638e-06, |
| "loss": 0.0774, |
| "step": 1553 |
| }, |
| { |
| "epoch": 13.282051282051283, |
| "grad_norm": 11.477370262145996, |
| "learning_rate": 3.358974358974359e-06, |
| "loss": 0.8342, |
| "step": 1554 |
| }, |
| { |
| "epoch": 13.290598290598291, |
| "grad_norm": 3.8596534729003906, |
| "learning_rate": 3.3547008547008547e-06, |
| "loss": 0.1924, |
| "step": 1555 |
| }, |
| { |
| "epoch": 13.2991452991453, |
| "grad_norm": 4.497336387634277, |
| "learning_rate": 3.3504273504273506e-06, |
| "loss": 0.2425, |
| "step": 1556 |
| }, |
| { |
| "epoch": 13.307692307692308, |
| "grad_norm": 1.4496978521347046, |
| "learning_rate": 3.346153846153846e-06, |
| "loss": 0.0168, |
| "step": 1557 |
| }, |
| { |
| "epoch": 13.316239316239317, |
| "grad_norm": 2.0277416706085205, |
| "learning_rate": 3.341880341880342e-06, |
| "loss": 0.0634, |
| "step": 1558 |
| }, |
| { |
| "epoch": 13.324786324786325, |
| "grad_norm": 2.9120066165924072, |
| "learning_rate": 3.337606837606838e-06, |
| "loss": 0.1153, |
| "step": 1559 |
| }, |
| { |
| "epoch": 13.333333333333334, |
| "grad_norm": 4.949625015258789, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 0.1412, |
| "step": 1560 |
| }, |
| { |
| "epoch": 13.341880341880342, |
| "grad_norm": 5.970853805541992, |
| "learning_rate": 3.3290598290598292e-06, |
| "loss": 0.1607, |
| "step": 1561 |
| }, |
| { |
| "epoch": 13.350427350427351, |
| "grad_norm": 2.1988022327423096, |
| "learning_rate": 3.324786324786325e-06, |
| "loss": 0.0329, |
| "step": 1562 |
| }, |
| { |
| "epoch": 13.35897435897436, |
| "grad_norm": 2.3578758239746094, |
| "learning_rate": 3.3205128205128206e-06, |
| "loss": 0.0711, |
| "step": 1563 |
| }, |
| { |
| "epoch": 13.367521367521368, |
| "grad_norm": 4.554023742675781, |
| "learning_rate": 3.3162393162393165e-06, |
| "loss": 0.1929, |
| "step": 1564 |
| }, |
| { |
| "epoch": 13.376068376068377, |
| "grad_norm": 3.577073335647583, |
| "learning_rate": 3.311965811965812e-06, |
| "loss": 0.0969, |
| "step": 1565 |
| }, |
| { |
| "epoch": 13.384615384615385, |
| "grad_norm": 3.3863015174865723, |
| "learning_rate": 3.307692307692308e-06, |
| "loss": 0.2402, |
| "step": 1566 |
| }, |
| { |
| "epoch": 13.393162393162394, |
| "grad_norm": 1.044550895690918, |
| "learning_rate": 3.3034188034188037e-06, |
| "loss": 0.026, |
| "step": 1567 |
| }, |
| { |
| "epoch": 13.401709401709402, |
| "grad_norm": 3.1525843143463135, |
| "learning_rate": 3.299145299145299e-06, |
| "loss": 0.0619, |
| "step": 1568 |
| }, |
| { |
| "epoch": 13.41025641025641, |
| "grad_norm": 2.0380606651306152, |
| "learning_rate": 3.294871794871795e-06, |
| "loss": 0.0477, |
| "step": 1569 |
| }, |
| { |
| "epoch": 13.418803418803419, |
| "grad_norm": 2.4260973930358887, |
| "learning_rate": 3.290598290598291e-06, |
| "loss": 0.0709, |
| "step": 1570 |
| }, |
| { |
| "epoch": 13.427350427350428, |
| "grad_norm": 20.958803176879883, |
| "learning_rate": 3.2863247863247864e-06, |
| "loss": 0.2297, |
| "step": 1571 |
| }, |
| { |
| "epoch": 13.435897435897436, |
| "grad_norm": 2.847252368927002, |
| "learning_rate": 3.2820512820512823e-06, |
| "loss": 0.0565, |
| "step": 1572 |
| }, |
| { |
| "epoch": 13.444444444444445, |
| "grad_norm": 3.646381139755249, |
| "learning_rate": 3.277777777777778e-06, |
| "loss": 0.3043, |
| "step": 1573 |
| }, |
| { |
| "epoch": 13.452991452991453, |
| "grad_norm": 3.0526609420776367, |
| "learning_rate": 3.2735042735042737e-06, |
| "loss": 0.0941, |
| "step": 1574 |
| }, |
| { |
| "epoch": 13.461538461538462, |
| "grad_norm": 1.6154388189315796, |
| "learning_rate": 3.2692307692307696e-06, |
| "loss": 0.0597, |
| "step": 1575 |
| }, |
| { |
| "epoch": 13.47008547008547, |
| "grad_norm": 1.0825392007827759, |
| "learning_rate": 3.2649572649572655e-06, |
| "loss": 0.0325, |
| "step": 1576 |
| }, |
| { |
| "epoch": 13.478632478632479, |
| "grad_norm": 6.045910358428955, |
| "learning_rate": 3.260683760683761e-06, |
| "loss": 0.2202, |
| "step": 1577 |
| }, |
| { |
| "epoch": 13.487179487179487, |
| "grad_norm": 3.0401153564453125, |
| "learning_rate": 3.256410256410257e-06, |
| "loss": 0.0923, |
| "step": 1578 |
| }, |
| { |
| "epoch": 13.495726495726496, |
| "grad_norm": 5.485551834106445, |
| "learning_rate": 3.2521367521367527e-06, |
| "loss": 0.3851, |
| "step": 1579 |
| }, |
| { |
| "epoch": 13.504273504273504, |
| "grad_norm": 2.575057029724121, |
| "learning_rate": 3.247863247863248e-06, |
| "loss": 0.0307, |
| "step": 1580 |
| }, |
| { |
| "epoch": 13.512820512820513, |
| "grad_norm": 2.7744545936584473, |
| "learning_rate": 3.243589743589744e-06, |
| "loss": 0.1791, |
| "step": 1581 |
| }, |
| { |
| "epoch": 13.521367521367521, |
| "grad_norm": 2.430640459060669, |
| "learning_rate": 3.2393162393162395e-06, |
| "loss": 0.1128, |
| "step": 1582 |
| }, |
| { |
| "epoch": 13.52991452991453, |
| "grad_norm": 4.902276992797852, |
| "learning_rate": 3.2350427350427354e-06, |
| "loss": 0.2661, |
| "step": 1583 |
| }, |
| { |
| "epoch": 13.538461538461538, |
| "grad_norm": 2.601134777069092, |
| "learning_rate": 3.2307692307692313e-06, |
| "loss": 0.1311, |
| "step": 1584 |
| }, |
| { |
| "epoch": 13.547008547008547, |
| "grad_norm": 6.309877395629883, |
| "learning_rate": 3.2264957264957268e-06, |
| "loss": 0.2621, |
| "step": 1585 |
| }, |
| { |
| "epoch": 13.555555555555555, |
| "grad_norm": 2.079618215560913, |
| "learning_rate": 3.2222222222222227e-06, |
| "loss": 0.0702, |
| "step": 1586 |
| }, |
| { |
| "epoch": 13.564102564102564, |
| "grad_norm": 2.309541702270508, |
| "learning_rate": 3.2179487179487186e-06, |
| "loss": 0.1577, |
| "step": 1587 |
| }, |
| { |
| "epoch": 13.572649572649572, |
| "grad_norm": 4.723629951477051, |
| "learning_rate": 3.213675213675214e-06, |
| "loss": 0.142, |
| "step": 1588 |
| }, |
| { |
| "epoch": 13.581196581196581, |
| "grad_norm": 2.557123899459839, |
| "learning_rate": 3.20940170940171e-06, |
| "loss": 0.1506, |
| "step": 1589 |
| }, |
| { |
| "epoch": 13.58974358974359, |
| "grad_norm": 2.3154499530792236, |
| "learning_rate": 3.205128205128206e-06, |
| "loss": 0.1039, |
| "step": 1590 |
| }, |
| { |
| "epoch": 13.598290598290598, |
| "grad_norm": 1.5464012622833252, |
| "learning_rate": 3.200854700854701e-06, |
| "loss": 0.0989, |
| "step": 1591 |
| }, |
| { |
| "epoch": 13.606837606837606, |
| "grad_norm": 1.5885653495788574, |
| "learning_rate": 3.1965811965811967e-06, |
| "loss": 0.0278, |
| "step": 1592 |
| }, |
| { |
| "epoch": 13.615384615384615, |
| "grad_norm": 2.7710390090942383, |
| "learning_rate": 3.192307692307692e-06, |
| "loss": 0.0521, |
| "step": 1593 |
| }, |
| { |
| "epoch": 13.623931623931623, |
| "grad_norm": 4.587305545806885, |
| "learning_rate": 3.188034188034188e-06, |
| "loss": 0.2609, |
| "step": 1594 |
| }, |
| { |
| "epoch": 13.632478632478632, |
| "grad_norm": 4.343963623046875, |
| "learning_rate": 3.183760683760684e-06, |
| "loss": 0.1079, |
| "step": 1595 |
| }, |
| { |
| "epoch": 13.64102564102564, |
| "grad_norm": 2.7653536796569824, |
| "learning_rate": 3.1794871794871795e-06, |
| "loss": 0.1293, |
| "step": 1596 |
| }, |
| { |
| "epoch": 13.649572649572649, |
| "grad_norm": 3.1731350421905518, |
| "learning_rate": 3.1752136752136753e-06, |
| "loss": 0.1279, |
| "step": 1597 |
| }, |
| { |
| "epoch": 13.658119658119658, |
| "grad_norm": 8.032745361328125, |
| "learning_rate": 3.1709401709401712e-06, |
| "loss": 0.2114, |
| "step": 1598 |
| }, |
| { |
| "epoch": 13.666666666666666, |
| "grad_norm": 5.6177263259887695, |
| "learning_rate": 3.1666666666666667e-06, |
| "loss": 0.0926, |
| "step": 1599 |
| }, |
| { |
| "epoch": 13.675213675213675, |
| "grad_norm": 3.3568480014801025, |
| "learning_rate": 3.1623931623931626e-06, |
| "loss": 0.1299, |
| "step": 1600 |
| }, |
| { |
| "epoch": 13.683760683760683, |
| "grad_norm": 5.182860374450684, |
| "learning_rate": 3.158119658119658e-06, |
| "loss": 0.1688, |
| "step": 1601 |
| }, |
| { |
| "epoch": 13.692307692307692, |
| "grad_norm": 5.954287052154541, |
| "learning_rate": 3.153846153846154e-06, |
| "loss": 0.2634, |
| "step": 1602 |
| }, |
| { |
| "epoch": 13.7008547008547, |
| "grad_norm": 2.8563358783721924, |
| "learning_rate": 3.14957264957265e-06, |
| "loss": 0.0469, |
| "step": 1603 |
| }, |
| { |
| "epoch": 13.709401709401709, |
| "grad_norm": 1.6049034595489502, |
| "learning_rate": 3.1452991452991453e-06, |
| "loss": 0.0855, |
| "step": 1604 |
| }, |
| { |
| "epoch": 13.717948717948717, |
| "grad_norm": 1.9734570980072021, |
| "learning_rate": 3.141025641025641e-06, |
| "loss": 0.0554, |
| "step": 1605 |
| }, |
| { |
| "epoch": 13.726495726495726, |
| "grad_norm": 1.8398605585098267, |
| "learning_rate": 3.136752136752137e-06, |
| "loss": 0.1033, |
| "step": 1606 |
| }, |
| { |
| "epoch": 13.735042735042736, |
| "grad_norm": 3.3013346195220947, |
| "learning_rate": 3.1324786324786326e-06, |
| "loss": 0.1476, |
| "step": 1607 |
| }, |
| { |
| "epoch": 13.743589743589745, |
| "grad_norm": 1.2622041702270508, |
| "learning_rate": 3.1282051282051284e-06, |
| "loss": 0.0222, |
| "step": 1608 |
| }, |
| { |
| "epoch": 13.752136752136753, |
| "grad_norm": 3.983888626098633, |
| "learning_rate": 3.1239316239316243e-06, |
| "loss": 0.0861, |
| "step": 1609 |
| }, |
| { |
| "epoch": 13.760683760683762, |
| "grad_norm": 2.883335828781128, |
| "learning_rate": 3.11965811965812e-06, |
| "loss": 0.0737, |
| "step": 1610 |
| }, |
| { |
| "epoch": 13.76923076923077, |
| "grad_norm": 0.9045059680938721, |
| "learning_rate": 3.1153846153846157e-06, |
| "loss": 0.0232, |
| "step": 1611 |
| }, |
| { |
| "epoch": 13.777777777777779, |
| "grad_norm": 1.8752232789993286, |
| "learning_rate": 3.1111111111111116e-06, |
| "loss": 0.0602, |
| "step": 1612 |
| }, |
| { |
| "epoch": 13.786324786324787, |
| "grad_norm": 3.088440418243408, |
| "learning_rate": 3.106837606837607e-06, |
| "loss": 0.102, |
| "step": 1613 |
| }, |
| { |
| "epoch": 13.794871794871796, |
| "grad_norm": 4.067224502563477, |
| "learning_rate": 3.102564102564103e-06, |
| "loss": 0.1461, |
| "step": 1614 |
| }, |
| { |
| "epoch": 13.803418803418804, |
| "grad_norm": 6.9123148918151855, |
| "learning_rate": 3.098290598290599e-06, |
| "loss": 0.0752, |
| "step": 1615 |
| }, |
| { |
| "epoch": 13.811965811965813, |
| "grad_norm": 17.15372657775879, |
| "learning_rate": 3.0940170940170943e-06, |
| "loss": 0.5163, |
| "step": 1616 |
| }, |
| { |
| "epoch": 13.820512820512821, |
| "grad_norm": 2.4951720237731934, |
| "learning_rate": 3.08974358974359e-06, |
| "loss": 0.1326, |
| "step": 1617 |
| }, |
| { |
| "epoch": 13.82905982905983, |
| "grad_norm": 2.1316449642181396, |
| "learning_rate": 3.0854700854700857e-06, |
| "loss": 0.0469, |
| "step": 1618 |
| }, |
| { |
| "epoch": 13.837606837606838, |
| "grad_norm": 2.5955941677093506, |
| "learning_rate": 3.0811965811965815e-06, |
| "loss": 0.1056, |
| "step": 1619 |
| }, |
| { |
| "epoch": 13.846153846153847, |
| "grad_norm": 14.360347747802734, |
| "learning_rate": 3.0769230769230774e-06, |
| "loss": 0.4793, |
| "step": 1620 |
| }, |
| { |
| "epoch": 13.854700854700855, |
| "grad_norm": 1.9134567975997925, |
| "learning_rate": 3.072649572649573e-06, |
| "loss": 0.054, |
| "step": 1621 |
| }, |
| { |
| "epoch": 13.863247863247864, |
| "grad_norm": 3.1168692111968994, |
| "learning_rate": 3.068376068376069e-06, |
| "loss": 0.321, |
| "step": 1622 |
| }, |
| { |
| "epoch": 13.871794871794872, |
| "grad_norm": 4.940008163452148, |
| "learning_rate": 3.0641025641025647e-06, |
| "loss": 0.1452, |
| "step": 1623 |
| }, |
| { |
| "epoch": 13.88034188034188, |
| "grad_norm": 3.001660108566284, |
| "learning_rate": 3.05982905982906e-06, |
| "loss": 0.1094, |
| "step": 1624 |
| }, |
| { |
| "epoch": 13.88888888888889, |
| "grad_norm": 1.3110100030899048, |
| "learning_rate": 3.055555555555556e-06, |
| "loss": 0.0305, |
| "step": 1625 |
| }, |
| { |
| "epoch": 13.897435897435898, |
| "grad_norm": 269.3442077636719, |
| "learning_rate": 3.051282051282052e-06, |
| "loss": 0.8319, |
| "step": 1626 |
| }, |
| { |
| "epoch": 13.905982905982906, |
| "grad_norm": 1.5236955881118774, |
| "learning_rate": 3.0470085470085474e-06, |
| "loss": 0.0294, |
| "step": 1627 |
| }, |
| { |
| "epoch": 13.914529914529915, |
| "grad_norm": 1.8342583179473877, |
| "learning_rate": 3.042735042735043e-06, |
| "loss": 0.1122, |
| "step": 1628 |
| }, |
| { |
| "epoch": 13.923076923076923, |
| "grad_norm": 1.7902953624725342, |
| "learning_rate": 3.0384615384615383e-06, |
| "loss": 0.0426, |
| "step": 1629 |
| }, |
| { |
| "epoch": 13.931623931623932, |
| "grad_norm": 1.461769938468933, |
| "learning_rate": 3.0341880341880342e-06, |
| "loss": 0.0326, |
| "step": 1630 |
| }, |
| { |
| "epoch": 13.94017094017094, |
| "grad_norm": 2.2590038776397705, |
| "learning_rate": 3.02991452991453e-06, |
| "loss": 0.067, |
| "step": 1631 |
| }, |
| { |
| "epoch": 13.948717948717949, |
| "grad_norm": 0.8894402980804443, |
| "learning_rate": 3.0256410256410256e-06, |
| "loss": 0.0269, |
| "step": 1632 |
| }, |
| { |
| "epoch": 13.957264957264957, |
| "grad_norm": 2.097757339477539, |
| "learning_rate": 3.0213675213675215e-06, |
| "loss": 0.1211, |
| "step": 1633 |
| }, |
| { |
| "epoch": 13.965811965811966, |
| "grad_norm": 4.112930774688721, |
| "learning_rate": 3.0170940170940174e-06, |
| "loss": 0.1026, |
| "step": 1634 |
| }, |
| { |
| "epoch": 13.974358974358974, |
| "grad_norm": 4.55318021774292, |
| "learning_rate": 3.012820512820513e-06, |
| "loss": 0.2808, |
| "step": 1635 |
| }, |
| { |
| "epoch": 13.982905982905983, |
| "grad_norm": 2.1912014484405518, |
| "learning_rate": 3.0085470085470087e-06, |
| "loss": 0.0906, |
| "step": 1636 |
| }, |
| { |
| "epoch": 13.991452991452991, |
| "grad_norm": 4.612771511077881, |
| "learning_rate": 3.004273504273504e-06, |
| "loss": 0.17, |
| "step": 1637 |
| }, |
| { |
| "epoch": 14.0, |
| "grad_norm": 7.162411212921143, |
| "learning_rate": 3e-06, |
| "loss": 0.131, |
| "step": 1638 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.06268326193094254, |
| "eval_runtime": 9.262, |
| "eval_samples_per_second": 50.313, |
| "eval_steps_per_second": 6.37, |
| "step": 1638 |
| }, |
| { |
| "epoch": 14.008547008547009, |
| "grad_norm": 4.41022253036499, |
| "learning_rate": 2.995726495726496e-06, |
| "loss": 0.1989, |
| "step": 1639 |
| }, |
| { |
| "epoch": 14.017094017094017, |
| "grad_norm": 2.2863216400146484, |
| "learning_rate": 2.9914529914529914e-06, |
| "loss": 0.0612, |
| "step": 1640 |
| }, |
| { |
| "epoch": 14.025641025641026, |
| "grad_norm": 1.5455230474472046, |
| "learning_rate": 2.9871794871794873e-06, |
| "loss": 0.0378, |
| "step": 1641 |
| }, |
| { |
| "epoch": 14.034188034188034, |
| "grad_norm": 0.9546025991439819, |
| "learning_rate": 2.9829059829059832e-06, |
| "loss": 0.0214, |
| "step": 1642 |
| }, |
| { |
| "epoch": 14.042735042735043, |
| "grad_norm": 5.546824932098389, |
| "learning_rate": 2.9786324786324787e-06, |
| "loss": 0.2502, |
| "step": 1643 |
| }, |
| { |
| "epoch": 14.051282051282051, |
| "grad_norm": 1.6261364221572876, |
| "learning_rate": 2.9743589743589746e-06, |
| "loss": 0.0271, |
| "step": 1644 |
| }, |
| { |
| "epoch": 14.05982905982906, |
| "grad_norm": 1.710256814956665, |
| "learning_rate": 2.9700854700854705e-06, |
| "loss": 0.0582, |
| "step": 1645 |
| }, |
| { |
| "epoch": 14.068376068376068, |
| "grad_norm": 1.2083494663238525, |
| "learning_rate": 2.965811965811966e-06, |
| "loss": 0.026, |
| "step": 1646 |
| }, |
| { |
| "epoch": 14.076923076923077, |
| "grad_norm": 3.6400561332702637, |
| "learning_rate": 2.961538461538462e-06, |
| "loss": 0.0896, |
| "step": 1647 |
| }, |
| { |
| "epoch": 14.085470085470085, |
| "grad_norm": 2.1084742546081543, |
| "learning_rate": 2.9572649572649577e-06, |
| "loss": 0.0269, |
| "step": 1648 |
| }, |
| { |
| "epoch": 14.094017094017094, |
| "grad_norm": 1.5661289691925049, |
| "learning_rate": 2.952991452991453e-06, |
| "loss": 0.0401, |
| "step": 1649 |
| }, |
| { |
| "epoch": 14.102564102564102, |
| "grad_norm": 23.358585357666016, |
| "learning_rate": 2.948717948717949e-06, |
| "loss": 0.2069, |
| "step": 1650 |
| }, |
| { |
| "epoch": 14.11111111111111, |
| "grad_norm": 9.171899795532227, |
| "learning_rate": 2.944444444444445e-06, |
| "loss": 0.2842, |
| "step": 1651 |
| }, |
| { |
| "epoch": 14.11965811965812, |
| "grad_norm": 1.3189946413040161, |
| "learning_rate": 2.9401709401709404e-06, |
| "loss": 0.0331, |
| "step": 1652 |
| }, |
| { |
| "epoch": 14.128205128205128, |
| "grad_norm": 3.6144192218780518, |
| "learning_rate": 2.9358974358974363e-06, |
| "loss": 0.2069, |
| "step": 1653 |
| }, |
| { |
| "epoch": 14.136752136752136, |
| "grad_norm": 2.764681577682495, |
| "learning_rate": 2.931623931623932e-06, |
| "loss": 0.0646, |
| "step": 1654 |
| }, |
| { |
| "epoch": 14.145299145299145, |
| "grad_norm": 2.073028564453125, |
| "learning_rate": 2.9273504273504277e-06, |
| "loss": 0.1223, |
| "step": 1655 |
| }, |
| { |
| "epoch": 14.153846153846153, |
| "grad_norm": 12.209549903869629, |
| "learning_rate": 2.9230769230769236e-06, |
| "loss": 0.1922, |
| "step": 1656 |
| }, |
| { |
| "epoch": 14.162393162393162, |
| "grad_norm": 3.1137638092041016, |
| "learning_rate": 2.918803418803419e-06, |
| "loss": 0.2586, |
| "step": 1657 |
| }, |
| { |
| "epoch": 14.17094017094017, |
| "grad_norm": 5.130307674407959, |
| "learning_rate": 2.914529914529915e-06, |
| "loss": 0.2695, |
| "step": 1658 |
| }, |
| { |
| "epoch": 14.179487179487179, |
| "grad_norm": 3.475097894668579, |
| "learning_rate": 2.910256410256411e-06, |
| "loss": 0.2131, |
| "step": 1659 |
| }, |
| { |
| "epoch": 14.188034188034187, |
| "grad_norm": 0.5851498246192932, |
| "learning_rate": 2.9059829059829063e-06, |
| "loss": 0.0167, |
| "step": 1660 |
| }, |
| { |
| "epoch": 14.196581196581196, |
| "grad_norm": 1.795509934425354, |
| "learning_rate": 2.901709401709402e-06, |
| "loss": 0.0857, |
| "step": 1661 |
| }, |
| { |
| "epoch": 14.205128205128204, |
| "grad_norm": 1.7123979330062866, |
| "learning_rate": 2.897435897435898e-06, |
| "loss": 0.0599, |
| "step": 1662 |
| }, |
| { |
| "epoch": 14.213675213675213, |
| "grad_norm": 1.230388879776001, |
| "learning_rate": 2.8931623931623935e-06, |
| "loss": 0.0255, |
| "step": 1663 |
| }, |
| { |
| "epoch": 14.222222222222221, |
| "grad_norm": 3.8747615814208984, |
| "learning_rate": 2.888888888888889e-06, |
| "loss": 0.1412, |
| "step": 1664 |
| }, |
| { |
| "epoch": 14.23076923076923, |
| "grad_norm": 2.233584403991699, |
| "learning_rate": 2.8846153846153845e-06, |
| "loss": 0.068, |
| "step": 1665 |
| }, |
| { |
| "epoch": 14.239316239316238, |
| "grad_norm": 5.327254772186279, |
| "learning_rate": 2.8803418803418804e-06, |
| "loss": 0.2616, |
| "step": 1666 |
| }, |
| { |
| "epoch": 14.247863247863247, |
| "grad_norm": 6.126563549041748, |
| "learning_rate": 2.8760683760683762e-06, |
| "loss": 0.0931, |
| "step": 1667 |
| }, |
| { |
| "epoch": 14.256410256410255, |
| "grad_norm": 1.4305050373077393, |
| "learning_rate": 2.8717948717948717e-06, |
| "loss": 0.0221, |
| "step": 1668 |
| }, |
| { |
| "epoch": 14.264957264957266, |
| "grad_norm": 3.0924506187438965, |
| "learning_rate": 2.8675213675213676e-06, |
| "loss": 0.0417, |
| "step": 1669 |
| }, |
| { |
| "epoch": 14.273504273504274, |
| "grad_norm": 2.548558235168457, |
| "learning_rate": 2.8632478632478635e-06, |
| "loss": 0.0744, |
| "step": 1670 |
| }, |
| { |
| "epoch": 14.282051282051283, |
| "grad_norm": 0.46632057428359985, |
| "learning_rate": 2.858974358974359e-06, |
| "loss": 0.0114, |
| "step": 1671 |
| }, |
| { |
| "epoch": 14.290598290598291, |
| "grad_norm": 2.5199391841888428, |
| "learning_rate": 2.854700854700855e-06, |
| "loss": 0.0819, |
| "step": 1672 |
| }, |
| { |
| "epoch": 14.2991452991453, |
| "grad_norm": 1.849133014678955, |
| "learning_rate": 2.8504273504273507e-06, |
| "loss": 0.0424, |
| "step": 1673 |
| }, |
| { |
| "epoch": 14.307692307692308, |
| "grad_norm": 2.9396777153015137, |
| "learning_rate": 2.846153846153846e-06, |
| "loss": 0.0836, |
| "step": 1674 |
| }, |
| { |
| "epoch": 14.316239316239317, |
| "grad_norm": 0.7128950953483582, |
| "learning_rate": 2.841880341880342e-06, |
| "loss": 0.0181, |
| "step": 1675 |
| }, |
| { |
| "epoch": 14.324786324786325, |
| "grad_norm": 2.1387767791748047, |
| "learning_rate": 2.8376068376068376e-06, |
| "loss": 0.0432, |
| "step": 1676 |
| }, |
| { |
| "epoch": 14.333333333333334, |
| "grad_norm": 7.104556083679199, |
| "learning_rate": 2.8333333333333335e-06, |
| "loss": 0.1277, |
| "step": 1677 |
| }, |
| { |
| "epoch": 14.341880341880342, |
| "grad_norm": 3.718749761581421, |
| "learning_rate": 2.8290598290598293e-06, |
| "loss": 0.0738, |
| "step": 1678 |
| }, |
| { |
| "epoch": 14.350427350427351, |
| "grad_norm": 3.9387831687927246, |
| "learning_rate": 2.824786324786325e-06, |
| "loss": 0.1374, |
| "step": 1679 |
| }, |
| { |
| "epoch": 14.35897435897436, |
| "grad_norm": 2.1527843475341797, |
| "learning_rate": 2.8205128205128207e-06, |
| "loss": 0.1426, |
| "step": 1680 |
| }, |
| { |
| "epoch": 14.367521367521368, |
| "grad_norm": 1.0589011907577515, |
| "learning_rate": 2.8162393162393166e-06, |
| "loss": 0.0343, |
| "step": 1681 |
| }, |
| { |
| "epoch": 14.376068376068377, |
| "grad_norm": 3.55014967918396, |
| "learning_rate": 2.811965811965812e-06, |
| "loss": 0.2962, |
| "step": 1682 |
| }, |
| { |
| "epoch": 14.384615384615385, |
| "grad_norm": 3.996713399887085, |
| "learning_rate": 2.807692307692308e-06, |
| "loss": 0.1458, |
| "step": 1683 |
| }, |
| { |
| "epoch": 14.393162393162394, |
| "grad_norm": 73.28384399414062, |
| "learning_rate": 2.803418803418804e-06, |
| "loss": 0.6138, |
| "step": 1684 |
| }, |
| { |
| "epoch": 14.401709401709402, |
| "grad_norm": 5.780628681182861, |
| "learning_rate": 2.7991452991452993e-06, |
| "loss": 0.2619, |
| "step": 1685 |
| }, |
| { |
| "epoch": 14.41025641025641, |
| "grad_norm": 3.2047317028045654, |
| "learning_rate": 2.794871794871795e-06, |
| "loss": 0.1917, |
| "step": 1686 |
| }, |
| { |
| "epoch": 14.418803418803419, |
| "grad_norm": 7.041647434234619, |
| "learning_rate": 2.790598290598291e-06, |
| "loss": 0.2136, |
| "step": 1687 |
| }, |
| { |
| "epoch": 14.427350427350428, |
| "grad_norm": 3.391404867172241, |
| "learning_rate": 2.7863247863247866e-06, |
| "loss": 0.094, |
| "step": 1688 |
| }, |
| { |
| "epoch": 14.435897435897436, |
| "grad_norm": 0.5430964231491089, |
| "learning_rate": 2.7820512820512824e-06, |
| "loss": 0.0139, |
| "step": 1689 |
| }, |
| { |
| "epoch": 14.444444444444445, |
| "grad_norm": 5.696547985076904, |
| "learning_rate": 2.7777777777777783e-06, |
| "loss": 0.5808, |
| "step": 1690 |
| }, |
| { |
| "epoch": 14.452991452991453, |
| "grad_norm": 3.5785481929779053, |
| "learning_rate": 2.773504273504274e-06, |
| "loss": 0.219, |
| "step": 1691 |
| }, |
| { |
| "epoch": 14.461538461538462, |
| "grad_norm": 6.63624906539917, |
| "learning_rate": 2.7692307692307697e-06, |
| "loss": 0.2586, |
| "step": 1692 |
| }, |
| { |
| "epoch": 14.47008547008547, |
| "grad_norm": 16.79705810546875, |
| "learning_rate": 2.764957264957265e-06, |
| "loss": 0.1762, |
| "step": 1693 |
| }, |
| { |
| "epoch": 14.478632478632479, |
| "grad_norm": 4.069973468780518, |
| "learning_rate": 2.760683760683761e-06, |
| "loss": 0.1191, |
| "step": 1694 |
| }, |
| { |
| "epoch": 14.487179487179487, |
| "grad_norm": 1.1191340684890747, |
| "learning_rate": 2.756410256410257e-06, |
| "loss": 0.0529, |
| "step": 1695 |
| }, |
| { |
| "epoch": 14.495726495726496, |
| "grad_norm": 2.23835825920105, |
| "learning_rate": 2.7521367521367524e-06, |
| "loss": 0.0681, |
| "step": 1696 |
| }, |
| { |
| "epoch": 14.504273504273504, |
| "grad_norm": 2.745694160461426, |
| "learning_rate": 2.7478632478632483e-06, |
| "loss": 0.1885, |
| "step": 1697 |
| }, |
| { |
| "epoch": 14.512820512820513, |
| "grad_norm": 3.642946720123291, |
| "learning_rate": 2.743589743589744e-06, |
| "loss": 0.2061, |
| "step": 1698 |
| }, |
| { |
| "epoch": 14.521367521367521, |
| "grad_norm": 2.7571651935577393, |
| "learning_rate": 2.7393162393162397e-06, |
| "loss": 0.074, |
| "step": 1699 |
| }, |
| { |
| "epoch": 14.52991452991453, |
| "grad_norm": 0.889057457447052, |
| "learning_rate": 2.7350427350427355e-06, |
| "loss": 0.0342, |
| "step": 1700 |
| }, |
| { |
| "epoch": 14.538461538461538, |
| "grad_norm": 0.5471668243408203, |
| "learning_rate": 2.7307692307692306e-06, |
| "loss": 0.0125, |
| "step": 1701 |
| }, |
| { |
| "epoch": 14.547008547008547, |
| "grad_norm": 6.883024215698242, |
| "learning_rate": 2.7264957264957265e-06, |
| "loss": 0.4102, |
| "step": 1702 |
| }, |
| { |
| "epoch": 14.555555555555555, |
| "grad_norm": 2.6678171157836914, |
| "learning_rate": 2.7222222222222224e-06, |
| "loss": 0.0872, |
| "step": 1703 |
| }, |
| { |
| "epoch": 14.564102564102564, |
| "grad_norm": 5.825995445251465, |
| "learning_rate": 2.717948717948718e-06, |
| "loss": 0.1081, |
| "step": 1704 |
| }, |
| { |
| "epoch": 14.572649572649572, |
| "grad_norm": 1.5447179079055786, |
| "learning_rate": 2.7136752136752137e-06, |
| "loss": 0.0838, |
| "step": 1705 |
| }, |
| { |
| "epoch": 14.581196581196581, |
| "grad_norm": 17.58099937438965, |
| "learning_rate": 2.7094017094017096e-06, |
| "loss": 0.6379, |
| "step": 1706 |
| }, |
| { |
| "epoch": 14.58974358974359, |
| "grad_norm": 0.9537908434867859, |
| "learning_rate": 2.705128205128205e-06, |
| "loss": 0.0221, |
| "step": 1707 |
| }, |
| { |
| "epoch": 14.598290598290598, |
| "grad_norm": 3.264037847518921, |
| "learning_rate": 2.700854700854701e-06, |
| "loss": 0.1282, |
| "step": 1708 |
| }, |
| { |
| "epoch": 14.606837606837606, |
| "grad_norm": 1.7752703428268433, |
| "learning_rate": 2.696581196581197e-06, |
| "loss": 0.0194, |
| "step": 1709 |
| }, |
| { |
| "epoch": 14.615384615384615, |
| "grad_norm": 4.8417649269104, |
| "learning_rate": 2.6923076923076923e-06, |
| "loss": 0.2217, |
| "step": 1710 |
| }, |
| { |
| "epoch": 14.623931623931623, |
| "grad_norm": 2.915694236755371, |
| "learning_rate": 2.6880341880341882e-06, |
| "loss": 0.1506, |
| "step": 1711 |
| }, |
| { |
| "epoch": 14.632478632478632, |
| "grad_norm": 10.983115196228027, |
| "learning_rate": 2.6837606837606837e-06, |
| "loss": 0.4307, |
| "step": 1712 |
| }, |
| { |
| "epoch": 14.64102564102564, |
| "grad_norm": 1.1121952533721924, |
| "learning_rate": 2.6794871794871796e-06, |
| "loss": 0.0211, |
| "step": 1713 |
| }, |
| { |
| "epoch": 14.649572649572649, |
| "grad_norm": 2.6676313877105713, |
| "learning_rate": 2.6752136752136755e-06, |
| "loss": 0.0997, |
| "step": 1714 |
| }, |
| { |
| "epoch": 14.658119658119658, |
| "grad_norm": 1.718767523765564, |
| "learning_rate": 2.670940170940171e-06, |
| "loss": 0.0533, |
| "step": 1715 |
| }, |
| { |
| "epoch": 14.666666666666666, |
| "grad_norm": 1.567866563796997, |
| "learning_rate": 2.666666666666667e-06, |
| "loss": 0.0913, |
| "step": 1716 |
| }, |
| { |
| "epoch": 14.675213675213675, |
| "grad_norm": 3.0697431564331055, |
| "learning_rate": 2.6623931623931627e-06, |
| "loss": 0.1133, |
| "step": 1717 |
| }, |
| { |
| "epoch": 14.683760683760683, |
| "grad_norm": 2.2237489223480225, |
| "learning_rate": 2.658119658119658e-06, |
| "loss": 0.1091, |
| "step": 1718 |
| }, |
| { |
| "epoch": 14.692307692307692, |
| "grad_norm": 6.050041198730469, |
| "learning_rate": 2.653846153846154e-06, |
| "loss": 0.5622, |
| "step": 1719 |
| }, |
| { |
| "epoch": 14.7008547008547, |
| "grad_norm": 1.1796153783798218, |
| "learning_rate": 2.64957264957265e-06, |
| "loss": 0.0522, |
| "step": 1720 |
| }, |
| { |
| "epoch": 14.709401709401709, |
| "grad_norm": 2.4849863052368164, |
| "learning_rate": 2.6452991452991454e-06, |
| "loss": 0.0332, |
| "step": 1721 |
| }, |
| { |
| "epoch": 14.717948717948717, |
| "grad_norm": 1.771933674812317, |
| "learning_rate": 2.6410256410256413e-06, |
| "loss": 0.0692, |
| "step": 1722 |
| }, |
| { |
| "epoch": 14.726495726495726, |
| "grad_norm": 4.174441337585449, |
| "learning_rate": 2.6367521367521372e-06, |
| "loss": 0.1419, |
| "step": 1723 |
| }, |
| { |
| "epoch": 14.735042735042736, |
| "grad_norm": 4.145920276641846, |
| "learning_rate": 2.6324786324786327e-06, |
| "loss": 0.5196, |
| "step": 1724 |
| }, |
| { |
| "epoch": 14.743589743589745, |
| "grad_norm": 3.363537073135376, |
| "learning_rate": 2.6282051282051286e-06, |
| "loss": 0.1187, |
| "step": 1725 |
| }, |
| { |
| "epoch": 14.752136752136753, |
| "grad_norm": 1.9558751583099365, |
| "learning_rate": 2.6239316239316245e-06, |
| "loss": 0.0193, |
| "step": 1726 |
| }, |
| { |
| "epoch": 14.760683760683762, |
| "grad_norm": 2.8293466567993164, |
| "learning_rate": 2.61965811965812e-06, |
| "loss": 0.0551, |
| "step": 1727 |
| }, |
| { |
| "epoch": 14.76923076923077, |
| "grad_norm": 1.2654905319213867, |
| "learning_rate": 2.615384615384616e-06, |
| "loss": 0.0805, |
| "step": 1728 |
| }, |
| { |
| "epoch": 14.777777777777779, |
| "grad_norm": 0.9344054460525513, |
| "learning_rate": 2.6111111111111113e-06, |
| "loss": 0.0177, |
| "step": 1729 |
| }, |
| { |
| "epoch": 14.786324786324787, |
| "grad_norm": 1.268433690071106, |
| "learning_rate": 2.606837606837607e-06, |
| "loss": 0.0185, |
| "step": 1730 |
| }, |
| { |
| "epoch": 14.794871794871796, |
| "grad_norm": 2.5544192790985107, |
| "learning_rate": 2.602564102564103e-06, |
| "loss": 0.063, |
| "step": 1731 |
| }, |
| { |
| "epoch": 14.803418803418804, |
| "grad_norm": 2.1078386306762695, |
| "learning_rate": 2.5982905982905985e-06, |
| "loss": 0.1203, |
| "step": 1732 |
| }, |
| { |
| "epoch": 14.811965811965813, |
| "grad_norm": 1.526848554611206, |
| "learning_rate": 2.5940170940170944e-06, |
| "loss": 0.0524, |
| "step": 1733 |
| }, |
| { |
| "epoch": 14.820512820512821, |
| "grad_norm": 0.7479220628738403, |
| "learning_rate": 2.5897435897435903e-06, |
| "loss": 0.0197, |
| "step": 1734 |
| }, |
| { |
| "epoch": 14.82905982905983, |
| "grad_norm": 2.937556266784668, |
| "learning_rate": 2.5854700854700858e-06, |
| "loss": 0.1406, |
| "step": 1735 |
| }, |
| { |
| "epoch": 14.837606837606838, |
| "grad_norm": 2.3128576278686523, |
| "learning_rate": 2.5811965811965817e-06, |
| "loss": 0.056, |
| "step": 1736 |
| }, |
| { |
| "epoch": 14.846153846153847, |
| "grad_norm": 2.1093039512634277, |
| "learning_rate": 2.5769230769230767e-06, |
| "loss": 0.0645, |
| "step": 1737 |
| }, |
| { |
| "epoch": 14.854700854700855, |
| "grad_norm": 2.104214668273926, |
| "learning_rate": 2.5726495726495726e-06, |
| "loss": 0.1097, |
| "step": 1738 |
| }, |
| { |
| "epoch": 14.863247863247864, |
| "grad_norm": 3.781390428543091, |
| "learning_rate": 2.5683760683760685e-06, |
| "loss": 0.1214, |
| "step": 1739 |
| }, |
| { |
| "epoch": 14.871794871794872, |
| "grad_norm": 4.119661331176758, |
| "learning_rate": 2.564102564102564e-06, |
| "loss": 0.1797, |
| "step": 1740 |
| }, |
| { |
| "epoch": 14.88034188034188, |
| "grad_norm": 6.488205909729004, |
| "learning_rate": 2.55982905982906e-06, |
| "loss": 0.0679, |
| "step": 1741 |
| }, |
| { |
| "epoch": 14.88888888888889, |
| "grad_norm": 1.4211604595184326, |
| "learning_rate": 2.5555555555555557e-06, |
| "loss": 0.0375, |
| "step": 1742 |
| }, |
| { |
| "epoch": 14.897435897435898, |
| "grad_norm": 3.577533721923828, |
| "learning_rate": 2.5512820512820512e-06, |
| "loss": 0.1914, |
| "step": 1743 |
| }, |
| { |
| "epoch": 14.905982905982906, |
| "grad_norm": 8.697205543518066, |
| "learning_rate": 2.547008547008547e-06, |
| "loss": 0.5511, |
| "step": 1744 |
| }, |
| { |
| "epoch": 14.914529914529915, |
| "grad_norm": 0.49716269969940186, |
| "learning_rate": 2.542735042735043e-06, |
| "loss": 0.0125, |
| "step": 1745 |
| }, |
| { |
| "epoch": 14.923076923076923, |
| "grad_norm": 2.8563008308410645, |
| "learning_rate": 2.5384615384615385e-06, |
| "loss": 0.0901, |
| "step": 1746 |
| }, |
| { |
| "epoch": 14.931623931623932, |
| "grad_norm": 3.6407926082611084, |
| "learning_rate": 2.5341880341880344e-06, |
| "loss": 0.0718, |
| "step": 1747 |
| }, |
| { |
| "epoch": 14.94017094017094, |
| "grad_norm": 1.2601441144943237, |
| "learning_rate": 2.52991452991453e-06, |
| "loss": 0.0451, |
| "step": 1748 |
| }, |
| { |
| "epoch": 14.948717948717949, |
| "grad_norm": 2.4402401447296143, |
| "learning_rate": 2.5256410256410257e-06, |
| "loss": 0.0771, |
| "step": 1749 |
| }, |
| { |
| "epoch": 14.957264957264957, |
| "grad_norm": 0.6150484681129456, |
| "learning_rate": 2.5213675213675216e-06, |
| "loss": 0.0151, |
| "step": 1750 |
| }, |
| { |
| "epoch": 14.965811965811966, |
| "grad_norm": 3.6569836139678955, |
| "learning_rate": 2.517094017094017e-06, |
| "loss": 0.0905, |
| "step": 1751 |
| }, |
| { |
| "epoch": 14.974358974358974, |
| "grad_norm": 3.4421300888061523, |
| "learning_rate": 2.512820512820513e-06, |
| "loss": 0.0456, |
| "step": 1752 |
| }, |
| { |
| "epoch": 14.982905982905983, |
| "grad_norm": 3.565871477127075, |
| "learning_rate": 2.508547008547009e-06, |
| "loss": 0.0491, |
| "step": 1753 |
| }, |
| { |
| "epoch": 14.991452991452991, |
| "grad_norm": 37.519065856933594, |
| "learning_rate": 2.5042735042735043e-06, |
| "loss": 0.1348, |
| "step": 1754 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 5.1902899742126465, |
| "learning_rate": 2.5e-06, |
| "loss": 0.1099, |
| "step": 1755 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 0.05930963531136513, |
| "eval_runtime": 9.2206, |
| "eval_samples_per_second": 50.539, |
| "eval_steps_per_second": 6.399, |
| "step": 1755 |
| }, |
| { |
| "epoch": 15.008547008547009, |
| "grad_norm": 5.6569342613220215, |
| "learning_rate": 2.495726495726496e-06, |
| "loss": 0.1931, |
| "step": 1756 |
| }, |
| { |
| "epoch": 15.017094017094017, |
| "grad_norm": 5.23728084564209, |
| "learning_rate": 2.4914529914529916e-06, |
| "loss": 0.2789, |
| "step": 1757 |
| }, |
| { |
| "epoch": 15.025641025641026, |
| "grad_norm": 0.8648807406425476, |
| "learning_rate": 2.4871794871794875e-06, |
| "loss": 0.0227, |
| "step": 1758 |
| }, |
| { |
| "epoch": 15.034188034188034, |
| "grad_norm": 3.0654587745666504, |
| "learning_rate": 2.4829059829059833e-06, |
| "loss": 0.0602, |
| "step": 1759 |
| }, |
| { |
| "epoch": 15.042735042735043, |
| "grad_norm": 4.374608039855957, |
| "learning_rate": 2.478632478632479e-06, |
| "loss": 0.2133, |
| "step": 1760 |
| }, |
| { |
| "epoch": 15.051282051282051, |
| "grad_norm": 1.2764301300048828, |
| "learning_rate": 2.4743589743589747e-06, |
| "loss": 0.0296, |
| "step": 1761 |
| }, |
| { |
| "epoch": 15.05982905982906, |
| "grad_norm": 0.9672349095344543, |
| "learning_rate": 2.4700854700854706e-06, |
| "loss": 0.0224, |
| "step": 1762 |
| }, |
| { |
| "epoch": 15.068376068376068, |
| "grad_norm": 8.807465553283691, |
| "learning_rate": 2.465811965811966e-06, |
| "loss": 0.0925, |
| "step": 1763 |
| }, |
| { |
| "epoch": 15.076923076923077, |
| "grad_norm": 1.4733474254608154, |
| "learning_rate": 2.461538461538462e-06, |
| "loss": 0.0286, |
| "step": 1764 |
| }, |
| { |
| "epoch": 15.085470085470085, |
| "grad_norm": 6.014289855957031, |
| "learning_rate": 2.4572649572649574e-06, |
| "loss": 0.1387, |
| "step": 1765 |
| }, |
| { |
| "epoch": 15.094017094017094, |
| "grad_norm": 1.899086356163025, |
| "learning_rate": 2.452991452991453e-06, |
| "loss": 0.07, |
| "step": 1766 |
| }, |
| { |
| "epoch": 15.102564102564102, |
| "grad_norm": 11.32197380065918, |
| "learning_rate": 2.4487179487179488e-06, |
| "loss": 0.2452, |
| "step": 1767 |
| }, |
| { |
| "epoch": 15.11111111111111, |
| "grad_norm": 3.223996639251709, |
| "learning_rate": 2.4444444444444447e-06, |
| "loss": 0.139, |
| "step": 1768 |
| }, |
| { |
| "epoch": 15.11965811965812, |
| "grad_norm": 2.8729913234710693, |
| "learning_rate": 2.44017094017094e-06, |
| "loss": 0.1386, |
| "step": 1769 |
| }, |
| { |
| "epoch": 15.128205128205128, |
| "grad_norm": 1.9730579853057861, |
| "learning_rate": 2.435897435897436e-06, |
| "loss": 0.0882, |
| "step": 1770 |
| }, |
| { |
| "epoch": 15.136752136752136, |
| "grad_norm": 5.556413650512695, |
| "learning_rate": 2.431623931623932e-06, |
| "loss": 0.1554, |
| "step": 1771 |
| }, |
| { |
| "epoch": 15.145299145299145, |
| "grad_norm": 1.2356898784637451, |
| "learning_rate": 2.4273504273504274e-06, |
| "loss": 0.0217, |
| "step": 1772 |
| }, |
| { |
| "epoch": 15.153846153846153, |
| "grad_norm": 7.849127769470215, |
| "learning_rate": 2.4230769230769233e-06, |
| "loss": 0.221, |
| "step": 1773 |
| }, |
| { |
| "epoch": 15.162393162393162, |
| "grad_norm": 0.5792569518089294, |
| "learning_rate": 2.418803418803419e-06, |
| "loss": 0.017, |
| "step": 1774 |
| }, |
| { |
| "epoch": 15.17094017094017, |
| "grad_norm": 2.2549376487731934, |
| "learning_rate": 2.4145299145299146e-06, |
| "loss": 0.0499, |
| "step": 1775 |
| }, |
| { |
| "epoch": 15.179487179487179, |
| "grad_norm": 2.722200870513916, |
| "learning_rate": 2.4102564102564105e-06, |
| "loss": 0.0408, |
| "step": 1776 |
| }, |
| { |
| "epoch": 15.188034188034187, |
| "grad_norm": 3.1140944957733154, |
| "learning_rate": 2.4059829059829064e-06, |
| "loss": 0.1001, |
| "step": 1777 |
| }, |
| { |
| "epoch": 15.196581196581196, |
| "grad_norm": 4.461791515350342, |
| "learning_rate": 2.401709401709402e-06, |
| "loss": 0.3419, |
| "step": 1778 |
| }, |
| { |
| "epoch": 15.205128205128204, |
| "grad_norm": 1.8562372922897339, |
| "learning_rate": 2.3974358974358978e-06, |
| "loss": 0.1092, |
| "step": 1779 |
| }, |
| { |
| "epoch": 15.213675213675213, |
| "grad_norm": 5.2086181640625, |
| "learning_rate": 2.3931623931623937e-06, |
| "loss": 0.1767, |
| "step": 1780 |
| }, |
| { |
| "epoch": 15.222222222222221, |
| "grad_norm": 1.6226582527160645, |
| "learning_rate": 2.388888888888889e-06, |
| "loss": 0.0347, |
| "step": 1781 |
| }, |
| { |
| "epoch": 15.23076923076923, |
| "grad_norm": 2.8507306575775146, |
| "learning_rate": 2.384615384615385e-06, |
| "loss": 0.0934, |
| "step": 1782 |
| }, |
| { |
| "epoch": 15.239316239316238, |
| "grad_norm": 2.74642276763916, |
| "learning_rate": 2.3803418803418805e-06, |
| "loss": 0.0857, |
| "step": 1783 |
| }, |
| { |
| "epoch": 15.247863247863247, |
| "grad_norm": 3.4352660179138184, |
| "learning_rate": 2.376068376068376e-06, |
| "loss": 0.2336, |
| "step": 1784 |
| }, |
| { |
| "epoch": 15.256410256410255, |
| "grad_norm": 3.4673473834991455, |
| "learning_rate": 2.371794871794872e-06, |
| "loss": 0.1974, |
| "step": 1785 |
| }, |
| { |
| "epoch": 15.264957264957266, |
| "grad_norm": 21.467744827270508, |
| "learning_rate": 2.3675213675213677e-06, |
| "loss": 0.6836, |
| "step": 1786 |
| }, |
| { |
| "epoch": 15.273504273504274, |
| "grad_norm": 2.832465887069702, |
| "learning_rate": 2.363247863247863e-06, |
| "loss": 0.245, |
| "step": 1787 |
| }, |
| { |
| "epoch": 15.282051282051283, |
| "grad_norm": 9.717825889587402, |
| "learning_rate": 2.358974358974359e-06, |
| "loss": 0.5324, |
| "step": 1788 |
| }, |
| { |
| "epoch": 15.290598290598291, |
| "grad_norm": 2.209528923034668, |
| "learning_rate": 2.354700854700855e-06, |
| "loss": 0.0854, |
| "step": 1789 |
| }, |
| { |
| "epoch": 15.2991452991453, |
| "grad_norm": 4.554971218109131, |
| "learning_rate": 2.3504273504273504e-06, |
| "loss": 0.1271, |
| "step": 1790 |
| }, |
| { |
| "epoch": 15.307692307692308, |
| "grad_norm": 3.1280457973480225, |
| "learning_rate": 2.3461538461538463e-06, |
| "loss": 0.1265, |
| "step": 1791 |
| }, |
| { |
| "epoch": 15.316239316239317, |
| "grad_norm": 2.647224187850952, |
| "learning_rate": 2.3418803418803422e-06, |
| "loss": 0.1965, |
| "step": 1792 |
| }, |
| { |
| "epoch": 15.324786324786325, |
| "grad_norm": 2.7695155143737793, |
| "learning_rate": 2.3376068376068377e-06, |
| "loss": 0.0528, |
| "step": 1793 |
| }, |
| { |
| "epoch": 15.333333333333334, |
| "grad_norm": 20.151025772094727, |
| "learning_rate": 2.3333333333333336e-06, |
| "loss": 0.2011, |
| "step": 1794 |
| }, |
| { |
| "epoch": 15.341880341880342, |
| "grad_norm": 2.8718080520629883, |
| "learning_rate": 2.3290598290598295e-06, |
| "loss": 0.0502, |
| "step": 1795 |
| }, |
| { |
| "epoch": 15.350427350427351, |
| "grad_norm": 2.17462158203125, |
| "learning_rate": 2.324786324786325e-06, |
| "loss": 0.0658, |
| "step": 1796 |
| }, |
| { |
| "epoch": 15.35897435897436, |
| "grad_norm": 4.324810981750488, |
| "learning_rate": 2.320512820512821e-06, |
| "loss": 0.1429, |
| "step": 1797 |
| }, |
| { |
| "epoch": 15.367521367521368, |
| "grad_norm": 184.52798461914062, |
| "learning_rate": 2.3162393162393167e-06, |
| "loss": 0.5155, |
| "step": 1798 |
| }, |
| { |
| "epoch": 15.376068376068377, |
| "grad_norm": 2.6076488494873047, |
| "learning_rate": 2.311965811965812e-06, |
| "loss": 0.0708, |
| "step": 1799 |
| }, |
| { |
| "epoch": 15.384615384615385, |
| "grad_norm": 3.0682790279388428, |
| "learning_rate": 2.307692307692308e-06, |
| "loss": 0.2662, |
| "step": 1800 |
| }, |
| { |
| "epoch": 15.393162393162394, |
| "grad_norm": 1.3366855382919312, |
| "learning_rate": 2.3034188034188035e-06, |
| "loss": 0.0136, |
| "step": 1801 |
| }, |
| { |
| "epoch": 15.401709401709402, |
| "grad_norm": 0.5489670634269714, |
| "learning_rate": 2.299145299145299e-06, |
| "loss": 0.0148, |
| "step": 1802 |
| }, |
| { |
| "epoch": 15.41025641025641, |
| "grad_norm": 1.080804705619812, |
| "learning_rate": 2.294871794871795e-06, |
| "loss": 0.025, |
| "step": 1803 |
| }, |
| { |
| "epoch": 15.418803418803419, |
| "grad_norm": 8.801629066467285, |
| "learning_rate": 2.290598290598291e-06, |
| "loss": 0.2038, |
| "step": 1804 |
| }, |
| { |
| "epoch": 15.427350427350428, |
| "grad_norm": 66.96419525146484, |
| "learning_rate": 2.2863247863247863e-06, |
| "loss": 0.4094, |
| "step": 1805 |
| }, |
| { |
| "epoch": 15.435897435897436, |
| "grad_norm": 1.3400782346725464, |
| "learning_rate": 2.282051282051282e-06, |
| "loss": 0.0452, |
| "step": 1806 |
| }, |
| { |
| "epoch": 15.444444444444445, |
| "grad_norm": 3.5850300788879395, |
| "learning_rate": 2.277777777777778e-06, |
| "loss": 0.0919, |
| "step": 1807 |
| }, |
| { |
| "epoch": 15.452991452991453, |
| "grad_norm": 8.670539855957031, |
| "learning_rate": 2.2735042735042735e-06, |
| "loss": 0.255, |
| "step": 1808 |
| }, |
| { |
| "epoch": 15.461538461538462, |
| "grad_norm": 3.609617233276367, |
| "learning_rate": 2.2692307692307694e-06, |
| "loss": 0.1203, |
| "step": 1809 |
| }, |
| { |
| "epoch": 15.47008547008547, |
| "grad_norm": 1.5857924222946167, |
| "learning_rate": 2.2649572649572653e-06, |
| "loss": 0.0371, |
| "step": 1810 |
| }, |
| { |
| "epoch": 15.478632478632479, |
| "grad_norm": 1.386805534362793, |
| "learning_rate": 2.2606837606837608e-06, |
| "loss": 0.0385, |
| "step": 1811 |
| }, |
| { |
| "epoch": 15.487179487179487, |
| "grad_norm": 4.130802631378174, |
| "learning_rate": 2.2564102564102566e-06, |
| "loss": 0.2261, |
| "step": 1812 |
| }, |
| { |
| "epoch": 15.495726495726496, |
| "grad_norm": 2.974247455596924, |
| "learning_rate": 2.2521367521367525e-06, |
| "loss": 0.0651, |
| "step": 1813 |
| }, |
| { |
| "epoch": 15.504273504273504, |
| "grad_norm": 1.2551554441452026, |
| "learning_rate": 2.247863247863248e-06, |
| "loss": 0.0229, |
| "step": 1814 |
| }, |
| { |
| "epoch": 15.512820512820513, |
| "grad_norm": 3.1401453018188477, |
| "learning_rate": 2.243589743589744e-06, |
| "loss": 0.0409, |
| "step": 1815 |
| }, |
| { |
| "epoch": 15.521367521367521, |
| "grad_norm": 1.3921948671340942, |
| "learning_rate": 2.2393162393162398e-06, |
| "loss": 0.0335, |
| "step": 1816 |
| }, |
| { |
| "epoch": 15.52991452991453, |
| "grad_norm": 5.457981586456299, |
| "learning_rate": 2.2350427350427353e-06, |
| "loss": 0.22, |
| "step": 1817 |
| }, |
| { |
| "epoch": 15.538461538461538, |
| "grad_norm": 0.9100427031517029, |
| "learning_rate": 2.230769230769231e-06, |
| "loss": 0.0217, |
| "step": 1818 |
| }, |
| { |
| "epoch": 15.547008547008547, |
| "grad_norm": 3.5890519618988037, |
| "learning_rate": 2.2264957264957266e-06, |
| "loss": 0.2241, |
| "step": 1819 |
| }, |
| { |
| "epoch": 15.555555555555555, |
| "grad_norm": 2.965954303741455, |
| "learning_rate": 2.222222222222222e-06, |
| "loss": 0.1453, |
| "step": 1820 |
| }, |
| { |
| "epoch": 15.564102564102564, |
| "grad_norm": 8.436135292053223, |
| "learning_rate": 2.217948717948718e-06, |
| "loss": 0.2784, |
| "step": 1821 |
| }, |
| { |
| "epoch": 15.572649572649572, |
| "grad_norm": 2.043687582015991, |
| "learning_rate": 2.213675213675214e-06, |
| "loss": 0.0755, |
| "step": 1822 |
| }, |
| { |
| "epoch": 15.581196581196581, |
| "grad_norm": 2.380276918411255, |
| "learning_rate": 2.2094017094017093e-06, |
| "loss": 0.1867, |
| "step": 1823 |
| }, |
| { |
| "epoch": 15.58974358974359, |
| "grad_norm": 2.5189390182495117, |
| "learning_rate": 2.2051282051282052e-06, |
| "loss": 0.0619, |
| "step": 1824 |
| }, |
| { |
| "epoch": 15.598290598290598, |
| "grad_norm": 1.123610258102417, |
| "learning_rate": 2.200854700854701e-06, |
| "loss": 0.0286, |
| "step": 1825 |
| }, |
| { |
| "epoch": 15.606837606837606, |
| "grad_norm": 3.0018534660339355, |
| "learning_rate": 2.1965811965811966e-06, |
| "loss": 0.1449, |
| "step": 1826 |
| }, |
| { |
| "epoch": 15.615384615384615, |
| "grad_norm": 2.178926706314087, |
| "learning_rate": 2.1923076923076925e-06, |
| "loss": 0.0859, |
| "step": 1827 |
| }, |
| { |
| "epoch": 15.623931623931623, |
| "grad_norm": 5.799438953399658, |
| "learning_rate": 2.1880341880341884e-06, |
| "loss": 0.2669, |
| "step": 1828 |
| }, |
| { |
| "epoch": 15.632478632478632, |
| "grad_norm": 2.0338144302368164, |
| "learning_rate": 2.183760683760684e-06, |
| "loss": 0.0616, |
| "step": 1829 |
| }, |
| { |
| "epoch": 15.64102564102564, |
| "grad_norm": 3.789525032043457, |
| "learning_rate": 2.1794871794871797e-06, |
| "loss": 0.0439, |
| "step": 1830 |
| }, |
| { |
| "epoch": 15.649572649572649, |
| "grad_norm": 2.3695919513702393, |
| "learning_rate": 2.1752136752136756e-06, |
| "loss": 0.0979, |
| "step": 1831 |
| }, |
| { |
| "epoch": 15.658119658119658, |
| "grad_norm": 0.8543546795845032, |
| "learning_rate": 2.170940170940171e-06, |
| "loss": 0.0171, |
| "step": 1832 |
| }, |
| { |
| "epoch": 15.666666666666666, |
| "grad_norm": 3.7921054363250732, |
| "learning_rate": 2.166666666666667e-06, |
| "loss": 0.1094, |
| "step": 1833 |
| }, |
| { |
| "epoch": 15.675213675213675, |
| "grad_norm": 1.9967904090881348, |
| "learning_rate": 2.162393162393163e-06, |
| "loss": 0.0382, |
| "step": 1834 |
| }, |
| { |
| "epoch": 15.683760683760683, |
| "grad_norm": 2.5073959827423096, |
| "learning_rate": 2.1581196581196583e-06, |
| "loss": 0.0554, |
| "step": 1835 |
| }, |
| { |
| "epoch": 15.692307692307692, |
| "grad_norm": 1.2741888761520386, |
| "learning_rate": 2.153846153846154e-06, |
| "loss": 0.056, |
| "step": 1836 |
| }, |
| { |
| "epoch": 15.7008547008547, |
| "grad_norm": 1.992280125617981, |
| "learning_rate": 2.1495726495726497e-06, |
| "loss": 0.0206, |
| "step": 1837 |
| }, |
| { |
| "epoch": 15.709401709401709, |
| "grad_norm": 1.0176990032196045, |
| "learning_rate": 2.145299145299145e-06, |
| "loss": 0.0276, |
| "step": 1838 |
| }, |
| { |
| "epoch": 15.717948717948717, |
| "grad_norm": 1.6685941219329834, |
| "learning_rate": 2.141025641025641e-06, |
| "loss": 0.0222, |
| "step": 1839 |
| }, |
| { |
| "epoch": 15.726495726495726, |
| "grad_norm": 3.171050548553467, |
| "learning_rate": 2.136752136752137e-06, |
| "loss": 0.1526, |
| "step": 1840 |
| }, |
| { |
| "epoch": 15.735042735042736, |
| "grad_norm": 1.5068336725234985, |
| "learning_rate": 2.1324786324786324e-06, |
| "loss": 0.0271, |
| "step": 1841 |
| }, |
| { |
| "epoch": 15.743589743589745, |
| "grad_norm": 3.171870708465576, |
| "learning_rate": 2.1282051282051283e-06, |
| "loss": 0.0628, |
| "step": 1842 |
| }, |
| { |
| "epoch": 15.752136752136753, |
| "grad_norm": 1.9212791919708252, |
| "learning_rate": 2.123931623931624e-06, |
| "loss": 0.1018, |
| "step": 1843 |
| }, |
| { |
| "epoch": 15.760683760683762, |
| "grad_norm": 4.073456287384033, |
| "learning_rate": 2.1196581196581196e-06, |
| "loss": 0.1144, |
| "step": 1844 |
| }, |
| { |
| "epoch": 15.76923076923077, |
| "grad_norm": 1.8453985452651978, |
| "learning_rate": 2.1153846153846155e-06, |
| "loss": 0.0995, |
| "step": 1845 |
| }, |
| { |
| "epoch": 15.777777777777779, |
| "grad_norm": 3.285759210586548, |
| "learning_rate": 2.1111111111111114e-06, |
| "loss": 0.1173, |
| "step": 1846 |
| }, |
| { |
| "epoch": 15.786324786324787, |
| "grad_norm": 3.709202289581299, |
| "learning_rate": 2.106837606837607e-06, |
| "loss": 0.1906, |
| "step": 1847 |
| }, |
| { |
| "epoch": 15.794871794871796, |
| "grad_norm": 1.951262354850769, |
| "learning_rate": 2.1025641025641028e-06, |
| "loss": 0.0954, |
| "step": 1848 |
| }, |
| { |
| "epoch": 15.803418803418804, |
| "grad_norm": 3.249171257019043, |
| "learning_rate": 2.0982905982905987e-06, |
| "loss": 0.1258, |
| "step": 1849 |
| }, |
| { |
| "epoch": 15.811965811965813, |
| "grad_norm": 0.5708752274513245, |
| "learning_rate": 2.094017094017094e-06, |
| "loss": 0.0128, |
| "step": 1850 |
| }, |
| { |
| "epoch": 15.820512820512821, |
| "grad_norm": 3.2894484996795654, |
| "learning_rate": 2.08974358974359e-06, |
| "loss": 0.0621, |
| "step": 1851 |
| }, |
| { |
| "epoch": 15.82905982905983, |
| "grad_norm": 0.8564540147781372, |
| "learning_rate": 2.085470085470086e-06, |
| "loss": 0.0194, |
| "step": 1852 |
| }, |
| { |
| "epoch": 15.837606837606838, |
| "grad_norm": 3.319011926651001, |
| "learning_rate": 2.0811965811965814e-06, |
| "loss": 0.1413, |
| "step": 1853 |
| }, |
| { |
| "epoch": 15.846153846153847, |
| "grad_norm": 1.5385066270828247, |
| "learning_rate": 2.0769230769230773e-06, |
| "loss": 0.0316, |
| "step": 1854 |
| }, |
| { |
| "epoch": 15.854700854700855, |
| "grad_norm": 4.076297283172607, |
| "learning_rate": 2.072649572649573e-06, |
| "loss": 0.2257, |
| "step": 1855 |
| }, |
| { |
| "epoch": 15.863247863247864, |
| "grad_norm": 4.738671779632568, |
| "learning_rate": 2.068376068376068e-06, |
| "loss": 0.1627, |
| "step": 1856 |
| }, |
| { |
| "epoch": 15.871794871794872, |
| "grad_norm": 5.589550495147705, |
| "learning_rate": 2.064102564102564e-06, |
| "loss": 0.3182, |
| "step": 1857 |
| }, |
| { |
| "epoch": 15.88034188034188, |
| "grad_norm": 1.6303757429122925, |
| "learning_rate": 2.05982905982906e-06, |
| "loss": 0.0384, |
| "step": 1858 |
| }, |
| { |
| "epoch": 15.88888888888889, |
| "grad_norm": 3.0257458686828613, |
| "learning_rate": 2.0555555555555555e-06, |
| "loss": 0.0967, |
| "step": 1859 |
| }, |
| { |
| "epoch": 15.897435897435898, |
| "grad_norm": 2.4926559925079346, |
| "learning_rate": 2.0512820512820513e-06, |
| "loss": 0.0703, |
| "step": 1860 |
| }, |
| { |
| "epoch": 15.905982905982906, |
| "grad_norm": 2.0784358978271484, |
| "learning_rate": 2.0470085470085472e-06, |
| "loss": 0.062, |
| "step": 1861 |
| }, |
| { |
| "epoch": 15.914529914529915, |
| "grad_norm": 4.92131233215332, |
| "learning_rate": 2.0427350427350427e-06, |
| "loss": 0.0875, |
| "step": 1862 |
| }, |
| { |
| "epoch": 15.923076923076923, |
| "grad_norm": 2.999511241912842, |
| "learning_rate": 2.0384615384615386e-06, |
| "loss": 0.0388, |
| "step": 1863 |
| }, |
| { |
| "epoch": 15.931623931623932, |
| "grad_norm": 5.770095348358154, |
| "learning_rate": 2.0341880341880345e-06, |
| "loss": 0.1257, |
| "step": 1864 |
| }, |
| { |
| "epoch": 15.94017094017094, |
| "grad_norm": 4.730950832366943, |
| "learning_rate": 2.02991452991453e-06, |
| "loss": 0.2386, |
| "step": 1865 |
| }, |
| { |
| "epoch": 15.948717948717949, |
| "grad_norm": 1.8125661611557007, |
| "learning_rate": 2.025641025641026e-06, |
| "loss": 0.0433, |
| "step": 1866 |
| }, |
| { |
| "epoch": 15.957264957264957, |
| "grad_norm": 5.433501243591309, |
| "learning_rate": 2.0213675213675217e-06, |
| "loss": 0.0536, |
| "step": 1867 |
| }, |
| { |
| "epoch": 15.965811965811966, |
| "grad_norm": 1.2565219402313232, |
| "learning_rate": 2.017094017094017e-06, |
| "loss": 0.0263, |
| "step": 1868 |
| }, |
| { |
| "epoch": 15.974358974358974, |
| "grad_norm": 1.5660192966461182, |
| "learning_rate": 2.012820512820513e-06, |
| "loss": 0.0387, |
| "step": 1869 |
| }, |
| { |
| "epoch": 15.982905982905983, |
| "grad_norm": 5.742929935455322, |
| "learning_rate": 2.008547008547009e-06, |
| "loss": 0.2158, |
| "step": 1870 |
| }, |
| { |
| "epoch": 15.991452991452991, |
| "grad_norm": 3.597506284713745, |
| "learning_rate": 2.0042735042735044e-06, |
| "loss": 0.0962, |
| "step": 1871 |
| }, |
| { |
| "epoch": 16.0, |
| "grad_norm": 1.753219485282898, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0193, |
| "step": 1872 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 0.05589358136057854, |
| "eval_runtime": 9.2203, |
| "eval_samples_per_second": 50.541, |
| "eval_steps_per_second": 6.399, |
| "step": 1872 |
| }, |
| { |
| "epoch": 16.00854700854701, |
| "grad_norm": 9.627431869506836, |
| "learning_rate": 1.9957264957264962e-06, |
| "loss": 0.4748, |
| "step": 1873 |
| }, |
| { |
| "epoch": 16.017094017094017, |
| "grad_norm": 7.770556926727295, |
| "learning_rate": 1.9914529914529917e-06, |
| "loss": 0.2615, |
| "step": 1874 |
| }, |
| { |
| "epoch": 16.025641025641026, |
| "grad_norm": 1.7268822193145752, |
| "learning_rate": 1.987179487179487e-06, |
| "loss": 0.0808, |
| "step": 1875 |
| }, |
| { |
| "epoch": 16.034188034188034, |
| "grad_norm": 1.7209370136260986, |
| "learning_rate": 1.982905982905983e-06, |
| "loss": 0.0575, |
| "step": 1876 |
| }, |
| { |
| "epoch": 16.042735042735043, |
| "grad_norm": 2.6422786712646484, |
| "learning_rate": 1.9786324786324785e-06, |
| "loss": 0.0815, |
| "step": 1877 |
| }, |
| { |
| "epoch": 16.05128205128205, |
| "grad_norm": 0.9057373404502869, |
| "learning_rate": 1.9743589743589744e-06, |
| "loss": 0.0359, |
| "step": 1878 |
| }, |
| { |
| "epoch": 16.05982905982906, |
| "grad_norm": 1.4879076480865479, |
| "learning_rate": 1.9700854700854703e-06, |
| "loss": 0.0658, |
| "step": 1879 |
| }, |
| { |
| "epoch": 16.068376068376068, |
| "grad_norm": 2.1336488723754883, |
| "learning_rate": 1.9658119658119658e-06, |
| "loss": 0.0434, |
| "step": 1880 |
| }, |
| { |
| "epoch": 16.076923076923077, |
| "grad_norm": 2.642249822616577, |
| "learning_rate": 1.9615384615384617e-06, |
| "loss": 0.0768, |
| "step": 1881 |
| }, |
| { |
| "epoch": 16.085470085470085, |
| "grad_norm": 398.1800842285156, |
| "learning_rate": 1.9572649572649575e-06, |
| "loss": 1.7061, |
| "step": 1882 |
| }, |
| { |
| "epoch": 16.094017094017094, |
| "grad_norm": 1.6067556142807007, |
| "learning_rate": 1.952991452991453e-06, |
| "loss": 0.0492, |
| "step": 1883 |
| }, |
| { |
| "epoch": 16.102564102564102, |
| "grad_norm": 45.67499542236328, |
| "learning_rate": 1.948717948717949e-06, |
| "loss": 0.2883, |
| "step": 1884 |
| }, |
| { |
| "epoch": 16.11111111111111, |
| "grad_norm": 5.477624416351318, |
| "learning_rate": 1.944444444444445e-06, |
| "loss": 0.1107, |
| "step": 1885 |
| }, |
| { |
| "epoch": 16.11965811965812, |
| "grad_norm": 2.2795376777648926, |
| "learning_rate": 1.9401709401709403e-06, |
| "loss": 0.0427, |
| "step": 1886 |
| }, |
| { |
| "epoch": 16.128205128205128, |
| "grad_norm": 1.9572805166244507, |
| "learning_rate": 1.935897435897436e-06, |
| "loss": 0.04, |
| "step": 1887 |
| }, |
| { |
| "epoch": 16.136752136752136, |
| "grad_norm": 1.9205402135849, |
| "learning_rate": 1.931623931623932e-06, |
| "loss": 0.0384, |
| "step": 1888 |
| }, |
| { |
| "epoch": 16.145299145299145, |
| "grad_norm": 1.6124738454818726, |
| "learning_rate": 1.9273504273504275e-06, |
| "loss": 0.0322, |
| "step": 1889 |
| }, |
| { |
| "epoch": 16.153846153846153, |
| "grad_norm": 3.3396270275115967, |
| "learning_rate": 1.9230769230769234e-06, |
| "loss": 0.1302, |
| "step": 1890 |
| }, |
| { |
| "epoch": 16.162393162393162, |
| "grad_norm": 2.4800124168395996, |
| "learning_rate": 1.9188034188034193e-06, |
| "loss": 0.1181, |
| "step": 1891 |
| }, |
| { |
| "epoch": 16.17094017094017, |
| "grad_norm": 5.452153205871582, |
| "learning_rate": 1.9145299145299148e-06, |
| "loss": 0.2054, |
| "step": 1892 |
| }, |
| { |
| "epoch": 16.17948717948718, |
| "grad_norm": 4.445066452026367, |
| "learning_rate": 1.9102564102564102e-06, |
| "loss": 0.1649, |
| "step": 1893 |
| }, |
| { |
| "epoch": 16.188034188034187, |
| "grad_norm": 1.0402263402938843, |
| "learning_rate": 1.9059829059829061e-06, |
| "loss": 0.0285, |
| "step": 1894 |
| }, |
| { |
| "epoch": 16.196581196581196, |
| "grad_norm": 1.8124594688415527, |
| "learning_rate": 1.9017094017094018e-06, |
| "loss": 0.0717, |
| "step": 1895 |
| }, |
| { |
| "epoch": 16.205128205128204, |
| "grad_norm": 5.0620245933532715, |
| "learning_rate": 1.8974358974358975e-06, |
| "loss": 0.3833, |
| "step": 1896 |
| }, |
| { |
| "epoch": 16.213675213675213, |
| "grad_norm": 3.201596975326538, |
| "learning_rate": 1.8931623931623931e-06, |
| "loss": 0.0687, |
| "step": 1897 |
| }, |
| { |
| "epoch": 16.22222222222222, |
| "grad_norm": 0.9610732793807983, |
| "learning_rate": 1.888888888888889e-06, |
| "loss": 0.0165, |
| "step": 1898 |
| }, |
| { |
| "epoch": 16.23076923076923, |
| "grad_norm": 1.3409554958343506, |
| "learning_rate": 1.8846153846153847e-06, |
| "loss": 0.024, |
| "step": 1899 |
| }, |
| { |
| "epoch": 16.23931623931624, |
| "grad_norm": 1.2862681150436401, |
| "learning_rate": 1.8803418803418804e-06, |
| "loss": 0.042, |
| "step": 1900 |
| }, |
| { |
| "epoch": 16.247863247863247, |
| "grad_norm": 6.403625011444092, |
| "learning_rate": 1.8760683760683763e-06, |
| "loss": 0.5536, |
| "step": 1901 |
| }, |
| { |
| "epoch": 16.256410256410255, |
| "grad_norm": 3.241731882095337, |
| "learning_rate": 1.871794871794872e-06, |
| "loss": 0.1045, |
| "step": 1902 |
| }, |
| { |
| "epoch": 16.264957264957264, |
| "grad_norm": 1.1206634044647217, |
| "learning_rate": 1.8675213675213676e-06, |
| "loss": 0.0383, |
| "step": 1903 |
| }, |
| { |
| "epoch": 16.273504273504273, |
| "grad_norm": 3.3005762100219727, |
| "learning_rate": 1.8632478632478635e-06, |
| "loss": 0.0786, |
| "step": 1904 |
| }, |
| { |
| "epoch": 16.28205128205128, |
| "grad_norm": 0.44867634773254395, |
| "learning_rate": 1.8589743589743592e-06, |
| "loss": 0.0104, |
| "step": 1905 |
| }, |
| { |
| "epoch": 16.29059829059829, |
| "grad_norm": 2.7023422718048096, |
| "learning_rate": 1.8547008547008549e-06, |
| "loss": 0.1091, |
| "step": 1906 |
| }, |
| { |
| "epoch": 16.299145299145298, |
| "grad_norm": 0.9612734317779541, |
| "learning_rate": 1.8504273504273506e-06, |
| "loss": 0.0165, |
| "step": 1907 |
| }, |
| { |
| "epoch": 16.307692307692307, |
| "grad_norm": 3.0632894039154053, |
| "learning_rate": 1.8461538461538465e-06, |
| "loss": 0.1118, |
| "step": 1908 |
| }, |
| { |
| "epoch": 16.316239316239315, |
| "grad_norm": 3.932769775390625, |
| "learning_rate": 1.8418803418803421e-06, |
| "loss": 0.1084, |
| "step": 1909 |
| }, |
| { |
| "epoch": 16.324786324786324, |
| "grad_norm": 7.795356273651123, |
| "learning_rate": 1.8376068376068378e-06, |
| "loss": 0.2923, |
| "step": 1910 |
| }, |
| { |
| "epoch": 16.333333333333332, |
| "grad_norm": 1.4187766313552856, |
| "learning_rate": 1.8333333333333333e-06, |
| "loss": 0.0408, |
| "step": 1911 |
| }, |
| { |
| "epoch": 16.34188034188034, |
| "grad_norm": 1.1020699739456177, |
| "learning_rate": 1.8290598290598292e-06, |
| "loss": 0.0168, |
| "step": 1912 |
| }, |
| { |
| "epoch": 16.35042735042735, |
| "grad_norm": 0.9890375733375549, |
| "learning_rate": 1.8247863247863249e-06, |
| "loss": 0.0391, |
| "step": 1913 |
| }, |
| { |
| "epoch": 16.358974358974358, |
| "grad_norm": 39.418235778808594, |
| "learning_rate": 1.8205128205128205e-06, |
| "loss": 0.2804, |
| "step": 1914 |
| }, |
| { |
| "epoch": 16.367521367521366, |
| "grad_norm": 1.6613589525222778, |
| "learning_rate": 1.8162393162393164e-06, |
| "loss": 0.0475, |
| "step": 1915 |
| }, |
| { |
| "epoch": 16.376068376068375, |
| "grad_norm": 4.359612464904785, |
| "learning_rate": 1.811965811965812e-06, |
| "loss": 0.2247, |
| "step": 1916 |
| }, |
| { |
| "epoch": 16.384615384615383, |
| "grad_norm": 1.970078706741333, |
| "learning_rate": 1.8076923076923078e-06, |
| "loss": 0.03, |
| "step": 1917 |
| }, |
| { |
| "epoch": 16.39316239316239, |
| "grad_norm": 2.046025037765503, |
| "learning_rate": 1.8034188034188035e-06, |
| "loss": 0.0277, |
| "step": 1918 |
| }, |
| { |
| "epoch": 16.4017094017094, |
| "grad_norm": 1.5775028467178345, |
| "learning_rate": 1.7991452991452994e-06, |
| "loss": 0.0764, |
| "step": 1919 |
| }, |
| { |
| "epoch": 16.41025641025641, |
| "grad_norm": 2.8837273120880127, |
| "learning_rate": 1.794871794871795e-06, |
| "loss": 0.0903, |
| "step": 1920 |
| }, |
| { |
| "epoch": 16.418803418803417, |
| "grad_norm": 7.059972763061523, |
| "learning_rate": 1.7905982905982907e-06, |
| "loss": 0.0679, |
| "step": 1921 |
| }, |
| { |
| "epoch": 16.427350427350426, |
| "grad_norm": 3.6101839542388916, |
| "learning_rate": 1.7863247863247866e-06, |
| "loss": 0.1402, |
| "step": 1922 |
| }, |
| { |
| "epoch": 16.435897435897434, |
| "grad_norm": 2.3459484577178955, |
| "learning_rate": 1.7820512820512823e-06, |
| "loss": 0.0751, |
| "step": 1923 |
| }, |
| { |
| "epoch": 16.444444444444443, |
| "grad_norm": 2.0556280612945557, |
| "learning_rate": 1.777777777777778e-06, |
| "loss": 0.0452, |
| "step": 1924 |
| }, |
| { |
| "epoch": 16.45299145299145, |
| "grad_norm": 0.5339368581771851, |
| "learning_rate": 1.7735042735042736e-06, |
| "loss": 0.013, |
| "step": 1925 |
| }, |
| { |
| "epoch": 16.46153846153846, |
| "grad_norm": 1.393329381942749, |
| "learning_rate": 1.7692307692307695e-06, |
| "loss": 0.038, |
| "step": 1926 |
| }, |
| { |
| "epoch": 16.47008547008547, |
| "grad_norm": 0.9439583420753479, |
| "learning_rate": 1.7649572649572652e-06, |
| "loss": 0.0228, |
| "step": 1927 |
| }, |
| { |
| "epoch": 16.478632478632477, |
| "grad_norm": 3.437713384628296, |
| "learning_rate": 1.7606837606837609e-06, |
| "loss": 0.2072, |
| "step": 1928 |
| }, |
| { |
| "epoch": 16.487179487179485, |
| "grad_norm": 1.725557804107666, |
| "learning_rate": 1.7564102564102563e-06, |
| "loss": 0.0494, |
| "step": 1929 |
| }, |
| { |
| "epoch": 16.495726495726494, |
| "grad_norm": 2.4226529598236084, |
| "learning_rate": 1.7521367521367522e-06, |
| "loss": 0.0796, |
| "step": 1930 |
| }, |
| { |
| "epoch": 16.504273504273506, |
| "grad_norm": 36.0551643371582, |
| "learning_rate": 1.747863247863248e-06, |
| "loss": 0.1966, |
| "step": 1931 |
| }, |
| { |
| "epoch": 16.51282051282051, |
| "grad_norm": 0.8370515704154968, |
| "learning_rate": 1.7435897435897436e-06, |
| "loss": 0.0346, |
| "step": 1932 |
| }, |
| { |
| "epoch": 16.521367521367523, |
| "grad_norm": 2.486854314804077, |
| "learning_rate": 1.7393162393162395e-06, |
| "loss": 0.1423, |
| "step": 1933 |
| }, |
| { |
| "epoch": 16.52991452991453, |
| "grad_norm": 3.2457993030548096, |
| "learning_rate": 1.7350427350427352e-06, |
| "loss": 0.1894, |
| "step": 1934 |
| }, |
| { |
| "epoch": 16.53846153846154, |
| "grad_norm": 2.1744906902313232, |
| "learning_rate": 1.7307692307692308e-06, |
| "loss": 0.0889, |
| "step": 1935 |
| }, |
| { |
| "epoch": 16.54700854700855, |
| "grad_norm": 1.9443250894546509, |
| "learning_rate": 1.7264957264957265e-06, |
| "loss": 0.0413, |
| "step": 1936 |
| }, |
| { |
| "epoch": 16.555555555555557, |
| "grad_norm": 2.0389249324798584, |
| "learning_rate": 1.7222222222222224e-06, |
| "loss": 0.0798, |
| "step": 1937 |
| }, |
| { |
| "epoch": 16.564102564102566, |
| "grad_norm": 4.600223064422607, |
| "learning_rate": 1.717948717948718e-06, |
| "loss": 0.0706, |
| "step": 1938 |
| }, |
| { |
| "epoch": 16.572649572649574, |
| "grad_norm": 1.4231921434402466, |
| "learning_rate": 1.7136752136752138e-06, |
| "loss": 0.0856, |
| "step": 1939 |
| }, |
| { |
| "epoch": 16.581196581196583, |
| "grad_norm": 4.8655290603637695, |
| "learning_rate": 1.7094017094017097e-06, |
| "loss": 0.2519, |
| "step": 1940 |
| }, |
| { |
| "epoch": 16.58974358974359, |
| "grad_norm": 2.6834962368011475, |
| "learning_rate": 1.7051282051282053e-06, |
| "loss": 0.0328, |
| "step": 1941 |
| }, |
| { |
| "epoch": 16.5982905982906, |
| "grad_norm": 0.625557541847229, |
| "learning_rate": 1.700854700854701e-06, |
| "loss": 0.0129, |
| "step": 1942 |
| }, |
| { |
| "epoch": 16.60683760683761, |
| "grad_norm": 10.57834243774414, |
| "learning_rate": 1.6965811965811967e-06, |
| "loss": 0.2987, |
| "step": 1943 |
| }, |
| { |
| "epoch": 16.615384615384617, |
| "grad_norm": 1.2357791662216187, |
| "learning_rate": 1.6923076923076926e-06, |
| "loss": 0.0294, |
| "step": 1944 |
| }, |
| { |
| "epoch": 16.623931623931625, |
| "grad_norm": 1.8380581140518188, |
| "learning_rate": 1.6880341880341883e-06, |
| "loss": 0.0298, |
| "step": 1945 |
| }, |
| { |
| "epoch": 16.632478632478634, |
| "grad_norm": 1.2370020151138306, |
| "learning_rate": 1.683760683760684e-06, |
| "loss": 0.0285, |
| "step": 1946 |
| }, |
| { |
| "epoch": 16.641025641025642, |
| "grad_norm": 5.922267913818359, |
| "learning_rate": 1.6794871794871794e-06, |
| "loss": 0.24, |
| "step": 1947 |
| }, |
| { |
| "epoch": 16.64957264957265, |
| "grad_norm": 2.439023494720459, |
| "learning_rate": 1.6752136752136753e-06, |
| "loss": 0.0988, |
| "step": 1948 |
| }, |
| { |
| "epoch": 16.65811965811966, |
| "grad_norm": 0.8908723592758179, |
| "learning_rate": 1.670940170940171e-06, |
| "loss": 0.026, |
| "step": 1949 |
| }, |
| { |
| "epoch": 16.666666666666668, |
| "grad_norm": 0.8728394508361816, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 0.018, |
| "step": 1950 |
| }, |
| { |
| "epoch": 16.675213675213676, |
| "grad_norm": 2.7304019927978516, |
| "learning_rate": 1.6623931623931626e-06, |
| "loss": 0.1567, |
| "step": 1951 |
| }, |
| { |
| "epoch": 16.683760683760685, |
| "grad_norm": 2.8601150512695312, |
| "learning_rate": 1.6581196581196582e-06, |
| "loss": 0.0721, |
| "step": 1952 |
| }, |
| { |
| "epoch": 16.692307692307693, |
| "grad_norm": 2.5990025997161865, |
| "learning_rate": 1.653846153846154e-06, |
| "loss": 0.2296, |
| "step": 1953 |
| }, |
| { |
| "epoch": 16.700854700854702, |
| "grad_norm": 3.7956109046936035, |
| "learning_rate": 1.6495726495726496e-06, |
| "loss": 0.2565, |
| "step": 1954 |
| }, |
| { |
| "epoch": 16.70940170940171, |
| "grad_norm": 5.933072566986084, |
| "learning_rate": 1.6452991452991455e-06, |
| "loss": 0.2712, |
| "step": 1955 |
| }, |
| { |
| "epoch": 16.71794871794872, |
| "grad_norm": 0.5651862621307373, |
| "learning_rate": 1.6410256410256412e-06, |
| "loss": 0.0132, |
| "step": 1956 |
| }, |
| { |
| "epoch": 16.726495726495727, |
| "grad_norm": 3.033231735229492, |
| "learning_rate": 1.6367521367521368e-06, |
| "loss": 0.074, |
| "step": 1957 |
| }, |
| { |
| "epoch": 16.735042735042736, |
| "grad_norm": 1.3515870571136475, |
| "learning_rate": 1.6324786324786327e-06, |
| "loss": 0.0614, |
| "step": 1958 |
| }, |
| { |
| "epoch": 16.743589743589745, |
| "grad_norm": 3.091700792312622, |
| "learning_rate": 1.6282051282051284e-06, |
| "loss": 0.1284, |
| "step": 1959 |
| }, |
| { |
| "epoch": 16.752136752136753, |
| "grad_norm": 7.142216205596924, |
| "learning_rate": 1.623931623931624e-06, |
| "loss": 0.1965, |
| "step": 1960 |
| }, |
| { |
| "epoch": 16.76068376068376, |
| "grad_norm": 7.488593578338623, |
| "learning_rate": 1.6196581196581198e-06, |
| "loss": 0.2498, |
| "step": 1961 |
| }, |
| { |
| "epoch": 16.76923076923077, |
| "grad_norm": 3.943833351135254, |
| "learning_rate": 1.6153846153846157e-06, |
| "loss": 0.0967, |
| "step": 1962 |
| }, |
| { |
| "epoch": 16.77777777777778, |
| "grad_norm": 1.8732318878173828, |
| "learning_rate": 1.6111111111111113e-06, |
| "loss": 0.029, |
| "step": 1963 |
| }, |
| { |
| "epoch": 16.786324786324787, |
| "grad_norm": 2.5445902347564697, |
| "learning_rate": 1.606837606837607e-06, |
| "loss": 0.0808, |
| "step": 1964 |
| }, |
| { |
| "epoch": 16.794871794871796, |
| "grad_norm": 4.969367504119873, |
| "learning_rate": 1.602564102564103e-06, |
| "loss": 0.164, |
| "step": 1965 |
| }, |
| { |
| "epoch": 16.803418803418804, |
| "grad_norm": 1.6954468488693237, |
| "learning_rate": 1.5982905982905984e-06, |
| "loss": 0.0645, |
| "step": 1966 |
| }, |
| { |
| "epoch": 16.811965811965813, |
| "grad_norm": 1.536352276802063, |
| "learning_rate": 1.594017094017094e-06, |
| "loss": 0.0595, |
| "step": 1967 |
| }, |
| { |
| "epoch": 16.82051282051282, |
| "grad_norm": 0.7326592803001404, |
| "learning_rate": 1.5897435897435897e-06, |
| "loss": 0.0153, |
| "step": 1968 |
| }, |
| { |
| "epoch": 16.82905982905983, |
| "grad_norm": 10.959025382995605, |
| "learning_rate": 1.5854700854700856e-06, |
| "loss": 0.3274, |
| "step": 1969 |
| }, |
| { |
| "epoch": 16.837606837606838, |
| "grad_norm": 10.305845260620117, |
| "learning_rate": 1.5811965811965813e-06, |
| "loss": 0.1404, |
| "step": 1970 |
| }, |
| { |
| "epoch": 16.846153846153847, |
| "grad_norm": 7.498697280883789, |
| "learning_rate": 1.576923076923077e-06, |
| "loss": 0.2269, |
| "step": 1971 |
| }, |
| { |
| "epoch": 16.854700854700855, |
| "grad_norm": 0.29253125190734863, |
| "learning_rate": 1.5726495726495727e-06, |
| "loss": 0.0074, |
| "step": 1972 |
| }, |
| { |
| "epoch": 16.863247863247864, |
| "grad_norm": 9.320234298706055, |
| "learning_rate": 1.5683760683760685e-06, |
| "loss": 0.067, |
| "step": 1973 |
| }, |
| { |
| "epoch": 16.871794871794872, |
| "grad_norm": 6.572272300720215, |
| "learning_rate": 1.5641025641025642e-06, |
| "loss": 0.4577, |
| "step": 1974 |
| }, |
| { |
| "epoch": 16.88034188034188, |
| "grad_norm": 5.368937969207764, |
| "learning_rate": 1.55982905982906e-06, |
| "loss": 0.2016, |
| "step": 1975 |
| }, |
| { |
| "epoch": 16.88888888888889, |
| "grad_norm": 0.5891698598861694, |
| "learning_rate": 1.5555555555555558e-06, |
| "loss": 0.0174, |
| "step": 1976 |
| }, |
| { |
| "epoch": 16.897435897435898, |
| "grad_norm": 3.045989751815796, |
| "learning_rate": 1.5512820512820515e-06, |
| "loss": 0.1748, |
| "step": 1977 |
| }, |
| { |
| "epoch": 16.905982905982906, |
| "grad_norm": 3.013834238052368, |
| "learning_rate": 1.5470085470085471e-06, |
| "loss": 0.2283, |
| "step": 1978 |
| }, |
| { |
| "epoch": 16.914529914529915, |
| "grad_norm": 1.2644447088241577, |
| "learning_rate": 1.5427350427350428e-06, |
| "loss": 0.0302, |
| "step": 1979 |
| }, |
| { |
| "epoch": 16.923076923076923, |
| "grad_norm": 4.429958820343018, |
| "learning_rate": 1.5384615384615387e-06, |
| "loss": 0.2458, |
| "step": 1980 |
| }, |
| { |
| "epoch": 16.931623931623932, |
| "grad_norm": 1.1556981801986694, |
| "learning_rate": 1.5341880341880344e-06, |
| "loss": 0.0179, |
| "step": 1981 |
| }, |
| { |
| "epoch": 16.94017094017094, |
| "grad_norm": 1.4588316679000854, |
| "learning_rate": 1.52991452991453e-06, |
| "loss": 0.1063, |
| "step": 1982 |
| }, |
| { |
| "epoch": 16.94871794871795, |
| "grad_norm": 1.124496340751648, |
| "learning_rate": 1.525641025641026e-06, |
| "loss": 0.0278, |
| "step": 1983 |
| }, |
| { |
| "epoch": 16.957264957264957, |
| "grad_norm": 0.7231981754302979, |
| "learning_rate": 1.5213675213675214e-06, |
| "loss": 0.0141, |
| "step": 1984 |
| }, |
| { |
| "epoch": 16.965811965811966, |
| "grad_norm": 1.4819642305374146, |
| "learning_rate": 1.5170940170940171e-06, |
| "loss": 0.0601, |
| "step": 1985 |
| }, |
| { |
| "epoch": 16.974358974358974, |
| "grad_norm": 0.7296791672706604, |
| "learning_rate": 1.5128205128205128e-06, |
| "loss": 0.0215, |
| "step": 1986 |
| }, |
| { |
| "epoch": 16.982905982905983, |
| "grad_norm": 15.651564598083496, |
| "learning_rate": 1.5085470085470087e-06, |
| "loss": 0.2954, |
| "step": 1987 |
| }, |
| { |
| "epoch": 16.99145299145299, |
| "grad_norm": 0.48891735076904297, |
| "learning_rate": 1.5042735042735044e-06, |
| "loss": 0.015, |
| "step": 1988 |
| }, |
| { |
| "epoch": 17.0, |
| "grad_norm": 7.363093376159668, |
| "learning_rate": 1.5e-06, |
| "loss": 0.2366, |
| "step": 1989 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 0.05406723916530609, |
| "eval_runtime": 9.389, |
| "eval_samples_per_second": 49.633, |
| "eval_steps_per_second": 6.284, |
| "step": 1989 |
| }, |
| { |
| "epoch": 17.00854700854701, |
| "grad_norm": 2.8626017570495605, |
| "learning_rate": 1.4957264957264957e-06, |
| "loss": 0.0902, |
| "step": 1990 |
| }, |
| { |
| "epoch": 17.017094017094017, |
| "grad_norm": 2.461879253387451, |
| "learning_rate": 1.4914529914529916e-06, |
| "loss": 0.0387, |
| "step": 1991 |
| }, |
| { |
| "epoch": 17.025641025641026, |
| "grad_norm": 6.336863994598389, |
| "learning_rate": 1.4871794871794873e-06, |
| "loss": 0.196, |
| "step": 1992 |
| }, |
| { |
| "epoch": 17.034188034188034, |
| "grad_norm": 1.1044467687606812, |
| "learning_rate": 1.482905982905983e-06, |
| "loss": 0.0352, |
| "step": 1993 |
| }, |
| { |
| "epoch": 17.042735042735043, |
| "grad_norm": 3.3509342670440674, |
| "learning_rate": 1.4786324786324789e-06, |
| "loss": 0.1459, |
| "step": 1994 |
| }, |
| { |
| "epoch": 17.05128205128205, |
| "grad_norm": 3.2349629402160645, |
| "learning_rate": 1.4743589743589745e-06, |
| "loss": 0.0179, |
| "step": 1995 |
| }, |
| { |
| "epoch": 17.05982905982906, |
| "grad_norm": 3.650749921798706, |
| "learning_rate": 1.4700854700854702e-06, |
| "loss": 0.1549, |
| "step": 1996 |
| }, |
| { |
| "epoch": 17.068376068376068, |
| "grad_norm": 1.6349891424179077, |
| "learning_rate": 1.465811965811966e-06, |
| "loss": 0.0713, |
| "step": 1997 |
| }, |
| { |
| "epoch": 17.076923076923077, |
| "grad_norm": 8.602070808410645, |
| "learning_rate": 1.4615384615384618e-06, |
| "loss": 0.3582, |
| "step": 1998 |
| }, |
| { |
| "epoch": 17.085470085470085, |
| "grad_norm": 3.1162590980529785, |
| "learning_rate": 1.4572649572649575e-06, |
| "loss": 0.2455, |
| "step": 1999 |
| }, |
| { |
| "epoch": 17.094017094017094, |
| "grad_norm": 1.4878407716751099, |
| "learning_rate": 1.4529914529914531e-06, |
| "loss": 0.0195, |
| "step": 2000 |
| }, |
| { |
| "epoch": 17.102564102564102, |
| "grad_norm": 2.565297842025757, |
| "learning_rate": 1.448717948717949e-06, |
| "loss": 0.1126, |
| "step": 2001 |
| }, |
| { |
| "epoch": 17.11111111111111, |
| "grad_norm": 4.169450759887695, |
| "learning_rate": 1.4444444444444445e-06, |
| "loss": 0.1774, |
| "step": 2002 |
| }, |
| { |
| "epoch": 17.11965811965812, |
| "grad_norm": 1.8476792573928833, |
| "learning_rate": 1.4401709401709402e-06, |
| "loss": 0.0288, |
| "step": 2003 |
| }, |
| { |
| "epoch": 17.128205128205128, |
| "grad_norm": 0.7279506921768188, |
| "learning_rate": 1.4358974358974359e-06, |
| "loss": 0.0217, |
| "step": 2004 |
| }, |
| { |
| "epoch": 17.136752136752136, |
| "grad_norm": 7.387227535247803, |
| "learning_rate": 1.4316239316239317e-06, |
| "loss": 0.248, |
| "step": 2005 |
| }, |
| { |
| "epoch": 17.145299145299145, |
| "grad_norm": 2.9455361366271973, |
| "learning_rate": 1.4273504273504274e-06, |
| "loss": 0.0439, |
| "step": 2006 |
| }, |
| { |
| "epoch": 17.153846153846153, |
| "grad_norm": 6.015694618225098, |
| "learning_rate": 1.423076923076923e-06, |
| "loss": 0.0656, |
| "step": 2007 |
| }, |
| { |
| "epoch": 17.162393162393162, |
| "grad_norm": 1.741774320602417, |
| "learning_rate": 1.4188034188034188e-06, |
| "loss": 0.0344, |
| "step": 2008 |
| }, |
| { |
| "epoch": 17.17094017094017, |
| "grad_norm": 0.5282659530639648, |
| "learning_rate": 1.4145299145299147e-06, |
| "loss": 0.0128, |
| "step": 2009 |
| }, |
| { |
| "epoch": 17.17948717948718, |
| "grad_norm": 2.4927468299865723, |
| "learning_rate": 1.4102564102564104e-06, |
| "loss": 0.1839, |
| "step": 2010 |
| }, |
| { |
| "epoch": 17.188034188034187, |
| "grad_norm": 0.7872166037559509, |
| "learning_rate": 1.405982905982906e-06, |
| "loss": 0.0204, |
| "step": 2011 |
| }, |
| { |
| "epoch": 17.196581196581196, |
| "grad_norm": 0.7072253227233887, |
| "learning_rate": 1.401709401709402e-06, |
| "loss": 0.0206, |
| "step": 2012 |
| }, |
| { |
| "epoch": 17.205128205128204, |
| "grad_norm": 1.0154236555099487, |
| "learning_rate": 1.3974358974358976e-06, |
| "loss": 0.0238, |
| "step": 2013 |
| }, |
| { |
| "epoch": 17.213675213675213, |
| "grad_norm": 2.9798424243927, |
| "learning_rate": 1.3931623931623933e-06, |
| "loss": 0.0542, |
| "step": 2014 |
| }, |
| { |
| "epoch": 17.22222222222222, |
| "grad_norm": 0.9568426012992859, |
| "learning_rate": 1.3888888888888892e-06, |
| "loss": 0.0239, |
| "step": 2015 |
| }, |
| { |
| "epoch": 17.23076923076923, |
| "grad_norm": 10.525039672851562, |
| "learning_rate": 1.3846153846153848e-06, |
| "loss": 0.1768, |
| "step": 2016 |
| }, |
| { |
| "epoch": 17.23931623931624, |
| "grad_norm": 1.697314977645874, |
| "learning_rate": 1.3803418803418805e-06, |
| "loss": 0.0453, |
| "step": 2017 |
| }, |
| { |
| "epoch": 17.247863247863247, |
| "grad_norm": 0.6436419486999512, |
| "learning_rate": 1.3760683760683762e-06, |
| "loss": 0.0163, |
| "step": 2018 |
| }, |
| { |
| "epoch": 17.256410256410255, |
| "grad_norm": 4.984555721282959, |
| "learning_rate": 1.371794871794872e-06, |
| "loss": 0.1157, |
| "step": 2019 |
| }, |
| { |
| "epoch": 17.264957264957264, |
| "grad_norm": 9.088909149169922, |
| "learning_rate": 1.3675213675213678e-06, |
| "loss": 0.2842, |
| "step": 2020 |
| }, |
| { |
| "epoch": 17.273504273504273, |
| "grad_norm": 10.398246765136719, |
| "learning_rate": 1.3632478632478632e-06, |
| "loss": 0.2528, |
| "step": 2021 |
| }, |
| { |
| "epoch": 17.28205128205128, |
| "grad_norm": 3.60273814201355, |
| "learning_rate": 1.358974358974359e-06, |
| "loss": 0.1799, |
| "step": 2022 |
| }, |
| { |
| "epoch": 17.29059829059829, |
| "grad_norm": 0.6845250129699707, |
| "learning_rate": 1.3547008547008548e-06, |
| "loss": 0.0196, |
| "step": 2023 |
| }, |
| { |
| "epoch": 17.299145299145298, |
| "grad_norm": 0.5363795161247253, |
| "learning_rate": 1.3504273504273505e-06, |
| "loss": 0.0136, |
| "step": 2024 |
| }, |
| { |
| "epoch": 17.307692307692307, |
| "grad_norm": 3.880434274673462, |
| "learning_rate": 1.3461538461538462e-06, |
| "loss": 0.3665, |
| "step": 2025 |
| }, |
| { |
| "epoch": 17.316239316239315, |
| "grad_norm": 4.580989360809326, |
| "learning_rate": 1.3418803418803418e-06, |
| "loss": 0.2593, |
| "step": 2026 |
| }, |
| { |
| "epoch": 17.324786324786324, |
| "grad_norm": 2.781501293182373, |
| "learning_rate": 1.3376068376068377e-06, |
| "loss": 0.1777, |
| "step": 2027 |
| }, |
| { |
| "epoch": 17.333333333333332, |
| "grad_norm": 5.605004787445068, |
| "learning_rate": 1.3333333333333334e-06, |
| "loss": 0.3633, |
| "step": 2028 |
| }, |
| { |
| "epoch": 17.34188034188034, |
| "grad_norm": 1.696486473083496, |
| "learning_rate": 1.329059829059829e-06, |
| "loss": 0.0353, |
| "step": 2029 |
| }, |
| { |
| "epoch": 17.35042735042735, |
| "grad_norm": 3.4415268898010254, |
| "learning_rate": 1.324786324786325e-06, |
| "loss": 0.0906, |
| "step": 2030 |
| }, |
| { |
| "epoch": 17.358974358974358, |
| "grad_norm": 7.722592353820801, |
| "learning_rate": 1.3205128205128207e-06, |
| "loss": 0.1804, |
| "step": 2031 |
| }, |
| { |
| "epoch": 17.367521367521366, |
| "grad_norm": 3.3161542415618896, |
| "learning_rate": 1.3162393162393163e-06, |
| "loss": 0.1336, |
| "step": 2032 |
| }, |
| { |
| "epoch": 17.376068376068375, |
| "grad_norm": 2.568871021270752, |
| "learning_rate": 1.3119658119658122e-06, |
| "loss": 0.0658, |
| "step": 2033 |
| }, |
| { |
| "epoch": 17.384615384615383, |
| "grad_norm": 3.5799806118011475, |
| "learning_rate": 1.307692307692308e-06, |
| "loss": 0.0652, |
| "step": 2034 |
| }, |
| { |
| "epoch": 17.39316239316239, |
| "grad_norm": 1.1399949789047241, |
| "learning_rate": 1.3034188034188036e-06, |
| "loss": 0.0196, |
| "step": 2035 |
| }, |
| { |
| "epoch": 17.4017094017094, |
| "grad_norm": 2.3688738346099854, |
| "learning_rate": 1.2991452991452993e-06, |
| "loss": 0.0706, |
| "step": 2036 |
| }, |
| { |
| "epoch": 17.41025641025641, |
| "grad_norm": 12.726486206054688, |
| "learning_rate": 1.2948717948717952e-06, |
| "loss": 0.2506, |
| "step": 2037 |
| }, |
| { |
| "epoch": 17.418803418803417, |
| "grad_norm": 2.249285936355591, |
| "learning_rate": 1.2905982905982908e-06, |
| "loss": 0.0532, |
| "step": 2038 |
| }, |
| { |
| "epoch": 17.427350427350426, |
| "grad_norm": 0.7129601836204529, |
| "learning_rate": 1.2863247863247863e-06, |
| "loss": 0.0207, |
| "step": 2039 |
| }, |
| { |
| "epoch": 17.435897435897434, |
| "grad_norm": 1.9362183809280396, |
| "learning_rate": 1.282051282051282e-06, |
| "loss": 0.0311, |
| "step": 2040 |
| }, |
| { |
| "epoch": 17.444444444444443, |
| "grad_norm": 2.253690242767334, |
| "learning_rate": 1.2777777777777779e-06, |
| "loss": 0.1203, |
| "step": 2041 |
| }, |
| { |
| "epoch": 17.45299145299145, |
| "grad_norm": 3.835174798965454, |
| "learning_rate": 1.2735042735042736e-06, |
| "loss": 0.0928, |
| "step": 2042 |
| }, |
| { |
| "epoch": 17.46153846153846, |
| "grad_norm": 143.36563110351562, |
| "learning_rate": 1.2692307692307692e-06, |
| "loss": 0.2984, |
| "step": 2043 |
| }, |
| { |
| "epoch": 17.47008547008547, |
| "grad_norm": 0.6122754216194153, |
| "learning_rate": 1.264957264957265e-06, |
| "loss": 0.0171, |
| "step": 2044 |
| }, |
| { |
| "epoch": 17.478632478632477, |
| "grad_norm": 3.0697991847991943, |
| "learning_rate": 1.2606837606837608e-06, |
| "loss": 0.1412, |
| "step": 2045 |
| }, |
| { |
| "epoch": 17.487179487179485, |
| "grad_norm": 1.0684096813201904, |
| "learning_rate": 1.2564102564102565e-06, |
| "loss": 0.0278, |
| "step": 2046 |
| }, |
| { |
| "epoch": 17.495726495726494, |
| "grad_norm": 5.379480838775635, |
| "learning_rate": 1.2521367521367522e-06, |
| "loss": 0.1114, |
| "step": 2047 |
| }, |
| { |
| "epoch": 17.504273504273506, |
| "grad_norm": 3.893343448638916, |
| "learning_rate": 1.247863247863248e-06, |
| "loss": 0.1499, |
| "step": 2048 |
| }, |
| { |
| "epoch": 17.51282051282051, |
| "grad_norm": 1.0436211824417114, |
| "learning_rate": 1.2435897435897437e-06, |
| "loss": 0.0259, |
| "step": 2049 |
| }, |
| { |
| "epoch": 17.521367521367523, |
| "grad_norm": 2.8706037998199463, |
| "learning_rate": 1.2393162393162394e-06, |
| "loss": 0.1071, |
| "step": 2050 |
| }, |
| { |
| "epoch": 17.52991452991453, |
| "grad_norm": 1.5661158561706543, |
| "learning_rate": 1.2350427350427353e-06, |
| "loss": 0.0392, |
| "step": 2051 |
| }, |
| { |
| "epoch": 17.53846153846154, |
| "grad_norm": 3.7152199745178223, |
| "learning_rate": 1.230769230769231e-06, |
| "loss": 0.0698, |
| "step": 2052 |
| }, |
| { |
| "epoch": 17.54700854700855, |
| "grad_norm": 2.6527271270751953, |
| "learning_rate": 1.2264957264957264e-06, |
| "loss": 0.1276, |
| "step": 2053 |
| }, |
| { |
| "epoch": 17.555555555555557, |
| "grad_norm": 0.9018534421920776, |
| "learning_rate": 1.2222222222222223e-06, |
| "loss": 0.066, |
| "step": 2054 |
| }, |
| { |
| "epoch": 17.564102564102566, |
| "grad_norm": 7.11035680770874, |
| "learning_rate": 1.217948717948718e-06, |
| "loss": 0.0836, |
| "step": 2055 |
| }, |
| { |
| "epoch": 17.572649572649574, |
| "grad_norm": 2.5168066024780273, |
| "learning_rate": 1.2136752136752137e-06, |
| "loss": 0.0662, |
| "step": 2056 |
| }, |
| { |
| "epoch": 17.581196581196583, |
| "grad_norm": 0.7215616703033447, |
| "learning_rate": 1.2094017094017096e-06, |
| "loss": 0.0186, |
| "step": 2057 |
| }, |
| { |
| "epoch": 17.58974358974359, |
| "grad_norm": 7.076876640319824, |
| "learning_rate": 1.2051282051282053e-06, |
| "loss": 0.1493, |
| "step": 2058 |
| }, |
| { |
| "epoch": 17.5982905982906, |
| "grad_norm": 1.1687662601470947, |
| "learning_rate": 1.200854700854701e-06, |
| "loss": 0.0368, |
| "step": 2059 |
| }, |
| { |
| "epoch": 17.60683760683761, |
| "grad_norm": 2.5085737705230713, |
| "learning_rate": 1.1965811965811968e-06, |
| "loss": 0.1567, |
| "step": 2060 |
| }, |
| { |
| "epoch": 17.615384615384617, |
| "grad_norm": 0.43566644191741943, |
| "learning_rate": 1.1923076923076925e-06, |
| "loss": 0.0097, |
| "step": 2061 |
| }, |
| { |
| "epoch": 17.623931623931625, |
| "grad_norm": 0.7698078155517578, |
| "learning_rate": 1.188034188034188e-06, |
| "loss": 0.0231, |
| "step": 2062 |
| }, |
| { |
| "epoch": 17.632478632478634, |
| "grad_norm": 1.8352185487747192, |
| "learning_rate": 1.1837606837606839e-06, |
| "loss": 0.0324, |
| "step": 2063 |
| }, |
| { |
| "epoch": 17.641025641025642, |
| "grad_norm": 12.11907958984375, |
| "learning_rate": 1.1794871794871795e-06, |
| "loss": 0.6052, |
| "step": 2064 |
| }, |
| { |
| "epoch": 17.64957264957265, |
| "grad_norm": 0.49942728877067566, |
| "learning_rate": 1.1752136752136752e-06, |
| "loss": 0.0111, |
| "step": 2065 |
| }, |
| { |
| "epoch": 17.65811965811966, |
| "grad_norm": 3.579129457473755, |
| "learning_rate": 1.1709401709401711e-06, |
| "loss": 0.1706, |
| "step": 2066 |
| }, |
| { |
| "epoch": 17.666666666666668, |
| "grad_norm": 2.112550973892212, |
| "learning_rate": 1.1666666666666668e-06, |
| "loss": 0.0438, |
| "step": 2067 |
| }, |
| { |
| "epoch": 17.675213675213676, |
| "grad_norm": 2.4429895877838135, |
| "learning_rate": 1.1623931623931625e-06, |
| "loss": 0.0498, |
| "step": 2068 |
| }, |
| { |
| "epoch": 17.683760683760685, |
| "grad_norm": 1.8436684608459473, |
| "learning_rate": 1.1581196581196584e-06, |
| "loss": 0.1228, |
| "step": 2069 |
| }, |
| { |
| "epoch": 17.692307692307693, |
| "grad_norm": 4.679569244384766, |
| "learning_rate": 1.153846153846154e-06, |
| "loss": 0.1505, |
| "step": 2070 |
| }, |
| { |
| "epoch": 17.700854700854702, |
| "grad_norm": 2.4409713745117188, |
| "learning_rate": 1.1495726495726495e-06, |
| "loss": 0.0603, |
| "step": 2071 |
| }, |
| { |
| "epoch": 17.70940170940171, |
| "grad_norm": 3.577721118927002, |
| "learning_rate": 1.1452991452991454e-06, |
| "loss": 0.1078, |
| "step": 2072 |
| }, |
| { |
| "epoch": 17.71794871794872, |
| "grad_norm": 3.774958372116089, |
| "learning_rate": 1.141025641025641e-06, |
| "loss": 0.3782, |
| "step": 2073 |
| }, |
| { |
| "epoch": 17.726495726495727, |
| "grad_norm": 2.9011383056640625, |
| "learning_rate": 1.1367521367521368e-06, |
| "loss": 0.0714, |
| "step": 2074 |
| }, |
| { |
| "epoch": 17.735042735042736, |
| "grad_norm": 1.7296162843704224, |
| "learning_rate": 1.1324786324786326e-06, |
| "loss": 0.0463, |
| "step": 2075 |
| }, |
| { |
| "epoch": 17.743589743589745, |
| "grad_norm": 1.8955838680267334, |
| "learning_rate": 1.1282051282051283e-06, |
| "loss": 0.0641, |
| "step": 2076 |
| }, |
| { |
| "epoch": 17.752136752136753, |
| "grad_norm": 3.0198490619659424, |
| "learning_rate": 1.123931623931624e-06, |
| "loss": 0.1516, |
| "step": 2077 |
| }, |
| { |
| "epoch": 17.76068376068376, |
| "grad_norm": 1.5012823343276978, |
| "learning_rate": 1.1196581196581199e-06, |
| "loss": 0.0206, |
| "step": 2078 |
| }, |
| { |
| "epoch": 17.76923076923077, |
| "grad_norm": 2.4390790462493896, |
| "learning_rate": 1.1153846153846156e-06, |
| "loss": 0.0458, |
| "step": 2079 |
| }, |
| { |
| "epoch": 17.77777777777778, |
| "grad_norm": 5.728135585784912, |
| "learning_rate": 1.111111111111111e-06, |
| "loss": 0.0443, |
| "step": 2080 |
| }, |
| { |
| "epoch": 17.786324786324787, |
| "grad_norm": 1.423771858215332, |
| "learning_rate": 1.106837606837607e-06, |
| "loss": 0.0223, |
| "step": 2081 |
| }, |
| { |
| "epoch": 17.794871794871796, |
| "grad_norm": 2.524941921234131, |
| "learning_rate": 1.1025641025641026e-06, |
| "loss": 0.0587, |
| "step": 2082 |
| }, |
| { |
| "epoch": 17.803418803418804, |
| "grad_norm": 0.9632331132888794, |
| "learning_rate": 1.0982905982905983e-06, |
| "loss": 0.0324, |
| "step": 2083 |
| }, |
| { |
| "epoch": 17.811965811965813, |
| "grad_norm": 1.8369181156158447, |
| "learning_rate": 1.0940170940170942e-06, |
| "loss": 0.0182, |
| "step": 2084 |
| }, |
| { |
| "epoch": 17.82051282051282, |
| "grad_norm": 2.547654867172241, |
| "learning_rate": 1.0897435897435899e-06, |
| "loss": 0.1395, |
| "step": 2085 |
| }, |
| { |
| "epoch": 17.82905982905983, |
| "grad_norm": 3.516977310180664, |
| "learning_rate": 1.0854700854700855e-06, |
| "loss": 0.1044, |
| "step": 2086 |
| }, |
| { |
| "epoch": 17.837606837606838, |
| "grad_norm": 1.7064217329025269, |
| "learning_rate": 1.0811965811965814e-06, |
| "loss": 0.0302, |
| "step": 2087 |
| }, |
| { |
| "epoch": 17.846153846153847, |
| "grad_norm": 1.7427505254745483, |
| "learning_rate": 1.076923076923077e-06, |
| "loss": 0.0298, |
| "step": 2088 |
| }, |
| { |
| "epoch": 17.854700854700855, |
| "grad_norm": 1.3395370244979858, |
| "learning_rate": 1.0726495726495726e-06, |
| "loss": 0.0302, |
| "step": 2089 |
| }, |
| { |
| "epoch": 17.863247863247864, |
| "grad_norm": 7.244344711303711, |
| "learning_rate": 1.0683760683760685e-06, |
| "loss": 0.1925, |
| "step": 2090 |
| }, |
| { |
| "epoch": 17.871794871794872, |
| "grad_norm": 5.942878723144531, |
| "learning_rate": 1.0641025641025641e-06, |
| "loss": 0.489, |
| "step": 2091 |
| }, |
| { |
| "epoch": 17.88034188034188, |
| "grad_norm": 3.244260787963867, |
| "learning_rate": 1.0598290598290598e-06, |
| "loss": 0.2538, |
| "step": 2092 |
| }, |
| { |
| "epoch": 17.88888888888889, |
| "grad_norm": 0.9833334684371948, |
| "learning_rate": 1.0555555555555557e-06, |
| "loss": 0.0215, |
| "step": 2093 |
| }, |
| { |
| "epoch": 17.897435897435898, |
| "grad_norm": 3.0194849967956543, |
| "learning_rate": 1.0512820512820514e-06, |
| "loss": 0.07, |
| "step": 2094 |
| }, |
| { |
| "epoch": 17.905982905982906, |
| "grad_norm": 0.48535388708114624, |
| "learning_rate": 1.047008547008547e-06, |
| "loss": 0.0113, |
| "step": 2095 |
| }, |
| { |
| "epoch": 17.914529914529915, |
| "grad_norm": 4.334452152252197, |
| "learning_rate": 1.042735042735043e-06, |
| "loss": 0.127, |
| "step": 2096 |
| }, |
| { |
| "epoch": 17.923076923076923, |
| "grad_norm": 3.54429030418396, |
| "learning_rate": 1.0384615384615386e-06, |
| "loss": 0.0704, |
| "step": 2097 |
| }, |
| { |
| "epoch": 17.931623931623932, |
| "grad_norm": 1.1745219230651855, |
| "learning_rate": 1.034188034188034e-06, |
| "loss": 0.0418, |
| "step": 2098 |
| }, |
| { |
| "epoch": 17.94017094017094, |
| "grad_norm": 5.157544136047363, |
| "learning_rate": 1.02991452991453e-06, |
| "loss": 0.2562, |
| "step": 2099 |
| }, |
| { |
| "epoch": 17.94871794871795, |
| "grad_norm": 4.454767227172852, |
| "learning_rate": 1.0256410256410257e-06, |
| "loss": 0.1141, |
| "step": 2100 |
| }, |
| { |
| "epoch": 17.957264957264957, |
| "grad_norm": 12.859573364257812, |
| "learning_rate": 1.0213675213675213e-06, |
| "loss": 0.3516, |
| "step": 2101 |
| }, |
| { |
| "epoch": 17.965811965811966, |
| "grad_norm": 5.780513763427734, |
| "learning_rate": 1.0170940170940172e-06, |
| "loss": 0.1663, |
| "step": 2102 |
| }, |
| { |
| "epoch": 17.974358974358974, |
| "grad_norm": 2.762153387069702, |
| "learning_rate": 1.012820512820513e-06, |
| "loss": 0.19, |
| "step": 2103 |
| }, |
| { |
| "epoch": 17.982905982905983, |
| "grad_norm": 5.649252891540527, |
| "learning_rate": 1.0085470085470086e-06, |
| "loss": 0.1736, |
| "step": 2104 |
| }, |
| { |
| "epoch": 17.99145299145299, |
| "grad_norm": 5.10836124420166, |
| "learning_rate": 1.0042735042735045e-06, |
| "loss": 0.1739, |
| "step": 2105 |
| }, |
| { |
| "epoch": 18.0, |
| "grad_norm": 6.474237442016602, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.3239, |
| "step": 2106 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 0.052614517509937286, |
| "eval_runtime": 9.28, |
| "eval_samples_per_second": 50.216, |
| "eval_steps_per_second": 6.358, |
| "step": 2106 |
| }, |
| { |
| "epoch": 18.00854700854701, |
| "grad_norm": 0.8820164203643799, |
| "learning_rate": 9.957264957264958e-07, |
| "loss": 0.0237, |
| "step": 2107 |
| }, |
| { |
| "epoch": 18.017094017094017, |
| "grad_norm": 2.692166566848755, |
| "learning_rate": 9.914529914529915e-07, |
| "loss": 0.0962, |
| "step": 2108 |
| }, |
| { |
| "epoch": 18.025641025641026, |
| "grad_norm": 0.8048399090766907, |
| "learning_rate": 9.871794871794872e-07, |
| "loss": 0.0232, |
| "step": 2109 |
| }, |
| { |
| "epoch": 18.034188034188034, |
| "grad_norm": 4.4439826011657715, |
| "learning_rate": 9.829059829059829e-07, |
| "loss": 0.064, |
| "step": 2110 |
| }, |
| { |
| "epoch": 18.042735042735043, |
| "grad_norm": 1.62433660030365, |
| "learning_rate": 9.786324786324788e-07, |
| "loss": 0.1263, |
| "step": 2111 |
| }, |
| { |
| "epoch": 18.05128205128205, |
| "grad_norm": 4.766104221343994, |
| "learning_rate": 9.743589743589745e-07, |
| "loss": 0.2108, |
| "step": 2112 |
| }, |
| { |
| "epoch": 18.05982905982906, |
| "grad_norm": 139.34445190429688, |
| "learning_rate": 9.700854700854701e-07, |
| "loss": 0.237, |
| "step": 2113 |
| }, |
| { |
| "epoch": 18.068376068376068, |
| "grad_norm": 0.6069220900535583, |
| "learning_rate": 9.65811965811966e-07, |
| "loss": 0.0135, |
| "step": 2114 |
| }, |
| { |
| "epoch": 18.076923076923077, |
| "grad_norm": 2.7833995819091797, |
| "learning_rate": 9.615384615384617e-07, |
| "loss": 0.1677, |
| "step": 2115 |
| }, |
| { |
| "epoch": 18.085470085470085, |
| "grad_norm": 4.570268630981445, |
| "learning_rate": 9.572649572649574e-07, |
| "loss": 0.2304, |
| "step": 2116 |
| }, |
| { |
| "epoch": 18.094017094017094, |
| "grad_norm": 4.7644805908203125, |
| "learning_rate": 9.529914529914531e-07, |
| "loss": 0.138, |
| "step": 2117 |
| }, |
| { |
| "epoch": 18.102564102564102, |
| "grad_norm": 1.9438762664794922, |
| "learning_rate": 9.487179487179487e-07, |
| "loss": 0.0488, |
| "step": 2118 |
| }, |
| { |
| "epoch": 18.11111111111111, |
| "grad_norm": 1.4188040494918823, |
| "learning_rate": 9.444444444444445e-07, |
| "loss": 0.0545, |
| "step": 2119 |
| }, |
| { |
| "epoch": 18.11965811965812, |
| "grad_norm": 0.357928603887558, |
| "learning_rate": 9.401709401709402e-07, |
| "loss": 0.0092, |
| "step": 2120 |
| }, |
| { |
| "epoch": 18.128205128205128, |
| "grad_norm": 1.8646256923675537, |
| "learning_rate": 9.35897435897436e-07, |
| "loss": 0.086, |
| "step": 2121 |
| }, |
| { |
| "epoch": 18.136752136752136, |
| "grad_norm": 2.111544609069824, |
| "learning_rate": 9.316239316239318e-07, |
| "loss": 0.0319, |
| "step": 2122 |
| }, |
| { |
| "epoch": 18.145299145299145, |
| "grad_norm": 3.0686893463134766, |
| "learning_rate": 9.273504273504274e-07, |
| "loss": 0.0689, |
| "step": 2123 |
| }, |
| { |
| "epoch": 18.153846153846153, |
| "grad_norm": 4.028079509735107, |
| "learning_rate": 9.230769230769232e-07, |
| "loss": 0.125, |
| "step": 2124 |
| }, |
| { |
| "epoch": 18.162393162393162, |
| "grad_norm": 1.0433181524276733, |
| "learning_rate": 9.188034188034189e-07, |
| "loss": 0.0174, |
| "step": 2125 |
| }, |
| { |
| "epoch": 18.17094017094017, |
| "grad_norm": 3.4533402919769287, |
| "learning_rate": 9.145299145299146e-07, |
| "loss": 0.1556, |
| "step": 2126 |
| }, |
| { |
| "epoch": 18.17948717948718, |
| "grad_norm": 11.187241554260254, |
| "learning_rate": 9.102564102564103e-07, |
| "loss": 0.2578, |
| "step": 2127 |
| }, |
| { |
| "epoch": 18.188034188034187, |
| "grad_norm": 2.544975757598877, |
| "learning_rate": 9.05982905982906e-07, |
| "loss": 0.0868, |
| "step": 2128 |
| }, |
| { |
| "epoch": 18.196581196581196, |
| "grad_norm": 2.490493059158325, |
| "learning_rate": 9.017094017094017e-07, |
| "loss": 0.1575, |
| "step": 2129 |
| }, |
| { |
| "epoch": 18.205128205128204, |
| "grad_norm": 4.665895938873291, |
| "learning_rate": 8.974358974358975e-07, |
| "loss": 0.1644, |
| "step": 2130 |
| }, |
| { |
| "epoch": 18.213675213675213, |
| "grad_norm": 3.135772943496704, |
| "learning_rate": 8.931623931623933e-07, |
| "loss": 0.205, |
| "step": 2131 |
| }, |
| { |
| "epoch": 18.22222222222222, |
| "grad_norm": 1.5636606216430664, |
| "learning_rate": 8.88888888888889e-07, |
| "loss": 0.0541, |
| "step": 2132 |
| }, |
| { |
| "epoch": 18.23076923076923, |
| "grad_norm": 3.603691816329956, |
| "learning_rate": 8.846153846153848e-07, |
| "loss": 0.0478, |
| "step": 2133 |
| }, |
| { |
| "epoch": 18.23931623931624, |
| "grad_norm": 2.6537222862243652, |
| "learning_rate": 8.803418803418804e-07, |
| "loss": 0.1206, |
| "step": 2134 |
| }, |
| { |
| "epoch": 18.247863247863247, |
| "grad_norm": 5.086421966552734, |
| "learning_rate": 8.760683760683761e-07, |
| "loss": 0.1212, |
| "step": 2135 |
| }, |
| { |
| "epoch": 18.256410256410255, |
| "grad_norm": 4.673394203186035, |
| "learning_rate": 8.717948717948718e-07, |
| "loss": 0.0588, |
| "step": 2136 |
| }, |
| { |
| "epoch": 18.264957264957264, |
| "grad_norm": 2.1376845836639404, |
| "learning_rate": 8.675213675213676e-07, |
| "loss": 0.0492, |
| "step": 2137 |
| }, |
| { |
| "epoch": 18.273504273504273, |
| "grad_norm": 2.8616504669189453, |
| "learning_rate": 8.632478632478633e-07, |
| "loss": 0.1834, |
| "step": 2138 |
| }, |
| { |
| "epoch": 18.28205128205128, |
| "grad_norm": 2.7179784774780273, |
| "learning_rate": 8.58974358974359e-07, |
| "loss": 0.1508, |
| "step": 2139 |
| }, |
| { |
| "epoch": 18.29059829059829, |
| "grad_norm": 1.1909416913986206, |
| "learning_rate": 8.547008547008548e-07, |
| "loss": 0.0721, |
| "step": 2140 |
| }, |
| { |
| "epoch": 18.299145299145298, |
| "grad_norm": 1.8272216320037842, |
| "learning_rate": 8.504273504273505e-07, |
| "loss": 0.0797, |
| "step": 2141 |
| }, |
| { |
| "epoch": 18.307692307692307, |
| "grad_norm": 4.394528388977051, |
| "learning_rate": 8.461538461538463e-07, |
| "loss": 0.2762, |
| "step": 2142 |
| }, |
| { |
| "epoch": 18.316239316239315, |
| "grad_norm": 4.276169776916504, |
| "learning_rate": 8.41880341880342e-07, |
| "loss": 0.0969, |
| "step": 2143 |
| }, |
| { |
| "epoch": 18.324786324786324, |
| "grad_norm": 2.0932376384735107, |
| "learning_rate": 8.376068376068377e-07, |
| "loss": 0.0595, |
| "step": 2144 |
| }, |
| { |
| "epoch": 18.333333333333332, |
| "grad_norm": 5.714378833770752, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 0.1176, |
| "step": 2145 |
| }, |
| { |
| "epoch": 18.34188034188034, |
| "grad_norm": 1.1050394773483276, |
| "learning_rate": 8.290598290598291e-07, |
| "loss": 0.0284, |
| "step": 2146 |
| }, |
| { |
| "epoch": 18.35042735042735, |
| "grad_norm": 3.2809271812438965, |
| "learning_rate": 8.247863247863248e-07, |
| "loss": 0.0737, |
| "step": 2147 |
| }, |
| { |
| "epoch": 18.358974358974358, |
| "grad_norm": 2.102889060974121, |
| "learning_rate": 8.205128205128206e-07, |
| "loss": 0.0477, |
| "step": 2148 |
| }, |
| { |
| "epoch": 18.367521367521366, |
| "grad_norm": 1.5728402137756348, |
| "learning_rate": 8.162393162393164e-07, |
| "loss": 0.0476, |
| "step": 2149 |
| }, |
| { |
| "epoch": 18.376068376068375, |
| "grad_norm": 2.0337905883789062, |
| "learning_rate": 8.11965811965812e-07, |
| "loss": 0.019, |
| "step": 2150 |
| }, |
| { |
| "epoch": 18.384615384615383, |
| "grad_norm": 5.475340843200684, |
| "learning_rate": 8.076923076923078e-07, |
| "loss": 0.1625, |
| "step": 2151 |
| }, |
| { |
| "epoch": 18.39316239316239, |
| "grad_norm": 0.4993753135204315, |
| "learning_rate": 8.034188034188035e-07, |
| "loss": 0.0132, |
| "step": 2152 |
| }, |
| { |
| "epoch": 18.4017094017094, |
| "grad_norm": 4.052933216094971, |
| "learning_rate": 7.991452991452992e-07, |
| "loss": 0.1603, |
| "step": 2153 |
| }, |
| { |
| "epoch": 18.41025641025641, |
| "grad_norm": 3.005293607711792, |
| "learning_rate": 7.948717948717949e-07, |
| "loss": 0.0399, |
| "step": 2154 |
| }, |
| { |
| "epoch": 18.418803418803417, |
| "grad_norm": 3.0186731815338135, |
| "learning_rate": 7.905982905982906e-07, |
| "loss": 0.0564, |
| "step": 2155 |
| }, |
| { |
| "epoch": 18.427350427350426, |
| "grad_norm": 5.522226333618164, |
| "learning_rate": 7.863247863247863e-07, |
| "loss": 0.1138, |
| "step": 2156 |
| }, |
| { |
| "epoch": 18.435897435897434, |
| "grad_norm": 5.463916301727295, |
| "learning_rate": 7.820512820512821e-07, |
| "loss": 0.4811, |
| "step": 2157 |
| }, |
| { |
| "epoch": 18.444444444444443, |
| "grad_norm": 0.41404595971107483, |
| "learning_rate": 7.777777777777779e-07, |
| "loss": 0.0114, |
| "step": 2158 |
| }, |
| { |
| "epoch": 18.45299145299145, |
| "grad_norm": 0.9279537200927734, |
| "learning_rate": 7.735042735042736e-07, |
| "loss": 0.0268, |
| "step": 2159 |
| }, |
| { |
| "epoch": 18.46153846153846, |
| "grad_norm": 0.5745738744735718, |
| "learning_rate": 7.692307692307694e-07, |
| "loss": 0.0155, |
| "step": 2160 |
| }, |
| { |
| "epoch": 18.47008547008547, |
| "grad_norm": 2.329507827758789, |
| "learning_rate": 7.64957264957265e-07, |
| "loss": 0.0421, |
| "step": 2161 |
| }, |
| { |
| "epoch": 18.478632478632477, |
| "grad_norm": 2.934424638748169, |
| "learning_rate": 7.606837606837607e-07, |
| "loss": 0.0925, |
| "step": 2162 |
| }, |
| { |
| "epoch": 18.487179487179485, |
| "grad_norm": 3.226261854171753, |
| "learning_rate": 7.564102564102564e-07, |
| "loss": 0.1914, |
| "step": 2163 |
| }, |
| { |
| "epoch": 18.495726495726494, |
| "grad_norm": 1.2033684253692627, |
| "learning_rate": 7.521367521367522e-07, |
| "loss": 0.0218, |
| "step": 2164 |
| }, |
| { |
| "epoch": 18.504273504273506, |
| "grad_norm": 1.092015266418457, |
| "learning_rate": 7.478632478632479e-07, |
| "loss": 0.0165, |
| "step": 2165 |
| }, |
| { |
| "epoch": 18.51282051282051, |
| "grad_norm": 1.2283809185028076, |
| "learning_rate": 7.435897435897436e-07, |
| "loss": 0.025, |
| "step": 2166 |
| }, |
| { |
| "epoch": 18.521367521367523, |
| "grad_norm": 6.3457722663879395, |
| "learning_rate": 7.393162393162394e-07, |
| "loss": 0.2224, |
| "step": 2167 |
| }, |
| { |
| "epoch": 18.52991452991453, |
| "grad_norm": 4.920536518096924, |
| "learning_rate": 7.350427350427351e-07, |
| "loss": 0.1381, |
| "step": 2168 |
| }, |
| { |
| "epoch": 18.53846153846154, |
| "grad_norm": 4.16088342666626, |
| "learning_rate": 7.307692307692309e-07, |
| "loss": 0.2725, |
| "step": 2169 |
| }, |
| { |
| "epoch": 18.54700854700855, |
| "grad_norm": 1.4776932001113892, |
| "learning_rate": 7.264957264957266e-07, |
| "loss": 0.0236, |
| "step": 2170 |
| }, |
| { |
| "epoch": 18.555555555555557, |
| "grad_norm": 5.517492294311523, |
| "learning_rate": 7.222222222222222e-07, |
| "loss": 0.3427, |
| "step": 2171 |
| }, |
| { |
| "epoch": 18.564102564102566, |
| "grad_norm": 0.7798398733139038, |
| "learning_rate": 7.179487179487179e-07, |
| "loss": 0.0139, |
| "step": 2172 |
| }, |
| { |
| "epoch": 18.572649572649574, |
| "grad_norm": 0.7174245119094849, |
| "learning_rate": 7.136752136752137e-07, |
| "loss": 0.0144, |
| "step": 2173 |
| }, |
| { |
| "epoch": 18.581196581196583, |
| "grad_norm": 5.118779182434082, |
| "learning_rate": 7.094017094017094e-07, |
| "loss": 0.1899, |
| "step": 2174 |
| }, |
| { |
| "epoch": 18.58974358974359, |
| "grad_norm": 2.8726353645324707, |
| "learning_rate": 7.051282051282052e-07, |
| "loss": 0.1177, |
| "step": 2175 |
| }, |
| { |
| "epoch": 18.5982905982906, |
| "grad_norm": 2.3775036334991455, |
| "learning_rate": 7.00854700854701e-07, |
| "loss": 0.1183, |
| "step": 2176 |
| }, |
| { |
| "epoch": 18.60683760683761, |
| "grad_norm": 19.23975944519043, |
| "learning_rate": 6.965811965811966e-07, |
| "loss": 0.4534, |
| "step": 2177 |
| }, |
| { |
| "epoch": 18.615384615384617, |
| "grad_norm": 1.3832803964614868, |
| "learning_rate": 6.923076923076924e-07, |
| "loss": 0.0309, |
| "step": 2178 |
| }, |
| { |
| "epoch": 18.623931623931625, |
| "grad_norm": 1.6752214431762695, |
| "learning_rate": 6.880341880341881e-07, |
| "loss": 0.0201, |
| "step": 2179 |
| }, |
| { |
| "epoch": 18.632478632478634, |
| "grad_norm": 3.1885950565338135, |
| "learning_rate": 6.837606837606839e-07, |
| "loss": 0.1242, |
| "step": 2180 |
| }, |
| { |
| "epoch": 18.641025641025642, |
| "grad_norm": 0.9290790557861328, |
| "learning_rate": 6.794871794871795e-07, |
| "loss": 0.0189, |
| "step": 2181 |
| }, |
| { |
| "epoch": 18.64957264957265, |
| "grad_norm": 0.25725051760673523, |
| "learning_rate": 6.752136752136752e-07, |
| "loss": 0.0065, |
| "step": 2182 |
| }, |
| { |
| "epoch": 18.65811965811966, |
| "grad_norm": 1.9815839529037476, |
| "learning_rate": 6.709401709401709e-07, |
| "loss": 0.0576, |
| "step": 2183 |
| }, |
| { |
| "epoch": 18.666666666666668, |
| "grad_norm": 1.924490213394165, |
| "learning_rate": 6.666666666666667e-07, |
| "loss": 0.0671, |
| "step": 2184 |
| }, |
| { |
| "epoch": 18.675213675213676, |
| "grad_norm": 2.9947164058685303, |
| "learning_rate": 6.623931623931625e-07, |
| "loss": 0.1859, |
| "step": 2185 |
| }, |
| { |
| "epoch": 18.683760683760685, |
| "grad_norm": 1.8680211305618286, |
| "learning_rate": 6.581196581196582e-07, |
| "loss": 0.1028, |
| "step": 2186 |
| }, |
| { |
| "epoch": 18.692307692307693, |
| "grad_norm": 0.823103666305542, |
| "learning_rate": 6.53846153846154e-07, |
| "loss": 0.0198, |
| "step": 2187 |
| }, |
| { |
| "epoch": 18.700854700854702, |
| "grad_norm": 2.3616061210632324, |
| "learning_rate": 6.495726495726496e-07, |
| "loss": 0.1025, |
| "step": 2188 |
| }, |
| { |
| "epoch": 18.70940170940171, |
| "grad_norm": 3.1370067596435547, |
| "learning_rate": 6.452991452991454e-07, |
| "loss": 0.0438, |
| "step": 2189 |
| }, |
| { |
| "epoch": 18.71794871794872, |
| "grad_norm": 8.058025360107422, |
| "learning_rate": 6.41025641025641e-07, |
| "loss": 0.082, |
| "step": 2190 |
| }, |
| { |
| "epoch": 18.726495726495727, |
| "grad_norm": 2.1969916820526123, |
| "learning_rate": 6.367521367521368e-07, |
| "loss": 0.1074, |
| "step": 2191 |
| }, |
| { |
| "epoch": 18.735042735042736, |
| "grad_norm": 2.5845255851745605, |
| "learning_rate": 6.324786324786325e-07, |
| "loss": 0.0795, |
| "step": 2192 |
| }, |
| { |
| "epoch": 18.743589743589745, |
| "grad_norm": 3.578331708908081, |
| "learning_rate": 6.282051282051282e-07, |
| "loss": 0.1111, |
| "step": 2193 |
| }, |
| { |
| "epoch": 18.752136752136753, |
| "grad_norm": 1.5390626192092896, |
| "learning_rate": 6.23931623931624e-07, |
| "loss": 0.064, |
| "step": 2194 |
| }, |
| { |
| "epoch": 18.76068376068376, |
| "grad_norm": 3.1742804050445557, |
| "learning_rate": 6.196581196581197e-07, |
| "loss": 0.0971, |
| "step": 2195 |
| }, |
| { |
| "epoch": 18.76923076923077, |
| "grad_norm": 1.7017542123794556, |
| "learning_rate": 6.153846153846155e-07, |
| "loss": 0.0424, |
| "step": 2196 |
| }, |
| { |
| "epoch": 18.77777777777778, |
| "grad_norm": 2.642102003097534, |
| "learning_rate": 6.111111111111112e-07, |
| "loss": 0.1243, |
| "step": 2197 |
| }, |
| { |
| "epoch": 18.786324786324787, |
| "grad_norm": 1.2010291814804077, |
| "learning_rate": 6.068376068376068e-07, |
| "loss": 0.0375, |
| "step": 2198 |
| }, |
| { |
| "epoch": 18.794871794871796, |
| "grad_norm": 3.1580190658569336, |
| "learning_rate": 6.025641025641026e-07, |
| "loss": 0.0565, |
| "step": 2199 |
| }, |
| { |
| "epoch": 18.803418803418804, |
| "grad_norm": 2.7660391330718994, |
| "learning_rate": 5.982905982905984e-07, |
| "loss": 0.0385, |
| "step": 2200 |
| }, |
| { |
| "epoch": 18.811965811965813, |
| "grad_norm": 0.7716617584228516, |
| "learning_rate": 5.94017094017094e-07, |
| "loss": 0.0159, |
| "step": 2201 |
| }, |
| { |
| "epoch": 18.82051282051282, |
| "grad_norm": 3.190251588821411, |
| "learning_rate": 5.897435897435898e-07, |
| "loss": 0.241, |
| "step": 2202 |
| }, |
| { |
| "epoch": 18.82905982905983, |
| "grad_norm": 7.115220069885254, |
| "learning_rate": 5.854700854700856e-07, |
| "loss": 0.1777, |
| "step": 2203 |
| }, |
| { |
| "epoch": 18.837606837606838, |
| "grad_norm": 5.071573257446289, |
| "learning_rate": 5.811965811965812e-07, |
| "loss": 0.5421, |
| "step": 2204 |
| }, |
| { |
| "epoch": 18.846153846153847, |
| "grad_norm": 3.8419785499572754, |
| "learning_rate": 5.76923076923077e-07, |
| "loss": 0.0784, |
| "step": 2205 |
| }, |
| { |
| "epoch": 18.854700854700855, |
| "grad_norm": 2.8234896659851074, |
| "learning_rate": 5.726495726495727e-07, |
| "loss": 0.1071, |
| "step": 2206 |
| }, |
| { |
| "epoch": 18.863247863247864, |
| "grad_norm": 1.4067480564117432, |
| "learning_rate": 5.683760683760684e-07, |
| "loss": 0.0375, |
| "step": 2207 |
| }, |
| { |
| "epoch": 18.871794871794872, |
| "grad_norm": 2.508589029312134, |
| "learning_rate": 5.641025641025642e-07, |
| "loss": 0.0921, |
| "step": 2208 |
| }, |
| { |
| "epoch": 18.88034188034188, |
| "grad_norm": 7.314038276672363, |
| "learning_rate": 5.598290598290599e-07, |
| "loss": 0.3581, |
| "step": 2209 |
| }, |
| { |
| "epoch": 18.88888888888889, |
| "grad_norm": 4.375041961669922, |
| "learning_rate": 5.555555555555555e-07, |
| "loss": 0.1115, |
| "step": 2210 |
| }, |
| { |
| "epoch": 18.897435897435898, |
| "grad_norm": 4.789741516113281, |
| "learning_rate": 5.512820512820513e-07, |
| "loss": 0.1813, |
| "step": 2211 |
| }, |
| { |
| "epoch": 18.905982905982906, |
| "grad_norm": 3.008720874786377, |
| "learning_rate": 5.470085470085471e-07, |
| "loss": 0.104, |
| "step": 2212 |
| }, |
| { |
| "epoch": 18.914529914529915, |
| "grad_norm": 0.6364433765411377, |
| "learning_rate": 5.427350427350428e-07, |
| "loss": 0.0153, |
| "step": 2213 |
| }, |
| { |
| "epoch": 18.923076923076923, |
| "grad_norm": 1.4009958505630493, |
| "learning_rate": 5.384615384615386e-07, |
| "loss": 0.0499, |
| "step": 2214 |
| }, |
| { |
| "epoch": 18.931623931623932, |
| "grad_norm": 4.53135347366333, |
| "learning_rate": 5.341880341880342e-07, |
| "loss": 0.1021, |
| "step": 2215 |
| }, |
| { |
| "epoch": 18.94017094017094, |
| "grad_norm": 0.7855163216590881, |
| "learning_rate": 5.299145299145299e-07, |
| "loss": 0.0297, |
| "step": 2216 |
| }, |
| { |
| "epoch": 18.94871794871795, |
| "grad_norm": 1.5316343307495117, |
| "learning_rate": 5.256410256410257e-07, |
| "loss": 0.0438, |
| "step": 2217 |
| }, |
| { |
| "epoch": 18.957264957264957, |
| "grad_norm": 1.2713849544525146, |
| "learning_rate": 5.213675213675215e-07, |
| "loss": 0.0311, |
| "step": 2218 |
| }, |
| { |
| "epoch": 18.965811965811966, |
| "grad_norm": 1.612418293952942, |
| "learning_rate": 5.17094017094017e-07, |
| "loss": 0.0796, |
| "step": 2219 |
| }, |
| { |
| "epoch": 18.974358974358974, |
| "grad_norm": 6.046596527099609, |
| "learning_rate": 5.128205128205128e-07, |
| "loss": 0.0835, |
| "step": 2220 |
| }, |
| { |
| "epoch": 18.982905982905983, |
| "grad_norm": 2.527993679046631, |
| "learning_rate": 5.085470085470086e-07, |
| "loss": 0.0448, |
| "step": 2221 |
| }, |
| { |
| "epoch": 18.99145299145299, |
| "grad_norm": 0.9519897699356079, |
| "learning_rate": 5.042735042735043e-07, |
| "loss": 0.0223, |
| "step": 2222 |
| }, |
| { |
| "epoch": 19.0, |
| "grad_norm": 14.08708667755127, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": 0.6753, |
| "step": 2223 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 0.05170569196343422, |
| "eval_runtime": 9.3972, |
| "eval_samples_per_second": 49.589, |
| "eval_steps_per_second": 6.278, |
| "step": 2223 |
| }, |
| { |
| "epoch": 19.00854700854701, |
| "grad_norm": 5.215019702911377, |
| "learning_rate": 4.957264957264958e-07, |
| "loss": 0.1614, |
| "step": 2224 |
| }, |
| { |
| "epoch": 19.017094017094017, |
| "grad_norm": 2.855567216873169, |
| "learning_rate": 4.914529914529914e-07, |
| "loss": 0.1051, |
| "step": 2225 |
| }, |
| { |
| "epoch": 19.025641025641026, |
| "grad_norm": 4.078762054443359, |
| "learning_rate": 4.871794871794872e-07, |
| "loss": 0.2859, |
| "step": 2226 |
| }, |
| { |
| "epoch": 19.034188034188034, |
| "grad_norm": 0.9259152412414551, |
| "learning_rate": 4.82905982905983e-07, |
| "loss": 0.0257, |
| "step": 2227 |
| }, |
| { |
| "epoch": 19.042735042735043, |
| "grad_norm": 3.629925012588501, |
| "learning_rate": 4.786324786324787e-07, |
| "loss": 0.1283, |
| "step": 2228 |
| }, |
| { |
| "epoch": 19.05128205128205, |
| "grad_norm": 3.104196310043335, |
| "learning_rate": 4.7435897435897437e-07, |
| "loss": 0.0701, |
| "step": 2229 |
| }, |
| { |
| "epoch": 19.05982905982906, |
| "grad_norm": 8.760592460632324, |
| "learning_rate": 4.700854700854701e-07, |
| "loss": 0.5793, |
| "step": 2230 |
| }, |
| { |
| "epoch": 19.068376068376068, |
| "grad_norm": 1.2966917753219604, |
| "learning_rate": 4.658119658119659e-07, |
| "loss": 0.0573, |
| "step": 2231 |
| }, |
| { |
| "epoch": 19.076923076923077, |
| "grad_norm": 1.7045038938522339, |
| "learning_rate": 4.615384615384616e-07, |
| "loss": 0.0497, |
| "step": 2232 |
| }, |
| { |
| "epoch": 19.085470085470085, |
| "grad_norm": 7.805142402648926, |
| "learning_rate": 4.572649572649573e-07, |
| "loss": 0.2898, |
| "step": 2233 |
| }, |
| { |
| "epoch": 19.094017094017094, |
| "grad_norm": 0.5019100308418274, |
| "learning_rate": 4.52991452991453e-07, |
| "loss": 0.0132, |
| "step": 2234 |
| }, |
| { |
| "epoch": 19.102564102564102, |
| "grad_norm": 3.1100540161132812, |
| "learning_rate": 4.4871794871794876e-07, |
| "loss": 0.0874, |
| "step": 2235 |
| }, |
| { |
| "epoch": 19.11111111111111, |
| "grad_norm": 0.40422680974006653, |
| "learning_rate": 4.444444444444445e-07, |
| "loss": 0.012, |
| "step": 2236 |
| }, |
| { |
| "epoch": 19.11965811965812, |
| "grad_norm": 1.2845938205718994, |
| "learning_rate": 4.401709401709402e-07, |
| "loss": 0.0259, |
| "step": 2237 |
| }, |
| { |
| "epoch": 19.128205128205128, |
| "grad_norm": 4.621537208557129, |
| "learning_rate": 4.358974358974359e-07, |
| "loss": 0.246, |
| "step": 2238 |
| }, |
| { |
| "epoch": 19.136752136752136, |
| "grad_norm": 1.1688278913497925, |
| "learning_rate": 4.3162393162393163e-07, |
| "loss": 0.0804, |
| "step": 2239 |
| }, |
| { |
| "epoch": 19.145299145299145, |
| "grad_norm": 10.896872520446777, |
| "learning_rate": 4.273504273504274e-07, |
| "loss": 0.2695, |
| "step": 2240 |
| }, |
| { |
| "epoch": 19.153846153846153, |
| "grad_norm": 2.7485415935516357, |
| "learning_rate": 4.2307692307692315e-07, |
| "loss": 0.0474, |
| "step": 2241 |
| }, |
| { |
| "epoch": 19.162393162393162, |
| "grad_norm": 1.1686739921569824, |
| "learning_rate": 4.188034188034188e-07, |
| "loss": 0.0257, |
| "step": 2242 |
| }, |
| { |
| "epoch": 19.17094017094017, |
| "grad_norm": 3.5579254627227783, |
| "learning_rate": 4.1452991452991456e-07, |
| "loss": 0.0419, |
| "step": 2243 |
| }, |
| { |
| "epoch": 19.17948717948718, |
| "grad_norm": 3.088649034500122, |
| "learning_rate": 4.102564102564103e-07, |
| "loss": 0.1229, |
| "step": 2244 |
| }, |
| { |
| "epoch": 19.188034188034187, |
| "grad_norm": 1.4894665479660034, |
| "learning_rate": 4.05982905982906e-07, |
| "loss": 0.0414, |
| "step": 2245 |
| }, |
| { |
| "epoch": 19.196581196581196, |
| "grad_norm": 5.022091865539551, |
| "learning_rate": 4.0170940170940175e-07, |
| "loss": 0.1423, |
| "step": 2246 |
| }, |
| { |
| "epoch": 19.205128205128204, |
| "grad_norm": 1.6117054224014282, |
| "learning_rate": 3.9743589743589743e-07, |
| "loss": 0.0244, |
| "step": 2247 |
| }, |
| { |
| "epoch": 19.213675213675213, |
| "grad_norm": 0.5429085493087769, |
| "learning_rate": 3.9316239316239316e-07, |
| "loss": 0.0122, |
| "step": 2248 |
| }, |
| { |
| "epoch": 19.22222222222222, |
| "grad_norm": 7.429282188415527, |
| "learning_rate": 3.8888888888888895e-07, |
| "loss": 0.122, |
| "step": 2249 |
| }, |
| { |
| "epoch": 19.23076923076923, |
| "grad_norm": 4.492022514343262, |
| "learning_rate": 3.846153846153847e-07, |
| "loss": 0.3181, |
| "step": 2250 |
| }, |
| { |
| "epoch": 19.23931623931624, |
| "grad_norm": 5.219499588012695, |
| "learning_rate": 3.8034188034188036e-07, |
| "loss": 0.1374, |
| "step": 2251 |
| }, |
| { |
| "epoch": 19.247863247863247, |
| "grad_norm": 3.454345941543579, |
| "learning_rate": 3.760683760683761e-07, |
| "loss": 0.147, |
| "step": 2252 |
| }, |
| { |
| "epoch": 19.256410256410255, |
| "grad_norm": 0.6370477080345154, |
| "learning_rate": 3.717948717948718e-07, |
| "loss": 0.0154, |
| "step": 2253 |
| }, |
| { |
| "epoch": 19.264957264957264, |
| "grad_norm": 1.7189971208572388, |
| "learning_rate": 3.6752136752136755e-07, |
| "loss": 0.0635, |
| "step": 2254 |
| }, |
| { |
| "epoch": 19.273504273504273, |
| "grad_norm": 2.716744899749756, |
| "learning_rate": 3.632478632478633e-07, |
| "loss": 0.0966, |
| "step": 2255 |
| }, |
| { |
| "epoch": 19.28205128205128, |
| "grad_norm": 2.4959864616394043, |
| "learning_rate": 3.5897435897435896e-07, |
| "loss": 0.0779, |
| "step": 2256 |
| }, |
| { |
| "epoch": 19.29059829059829, |
| "grad_norm": 3.625793218612671, |
| "learning_rate": 3.547008547008547e-07, |
| "loss": 0.3238, |
| "step": 2257 |
| }, |
| { |
| "epoch": 19.299145299145298, |
| "grad_norm": 1.8783844709396362, |
| "learning_rate": 3.504273504273505e-07, |
| "loss": 0.0319, |
| "step": 2258 |
| }, |
| { |
| "epoch": 19.307692307692307, |
| "grad_norm": 1.6740922927856445, |
| "learning_rate": 3.461538461538462e-07, |
| "loss": 0.0844, |
| "step": 2259 |
| }, |
| { |
| "epoch": 19.316239316239315, |
| "grad_norm": 2.8891098499298096, |
| "learning_rate": 3.4188034188034194e-07, |
| "loss": 0.1916, |
| "step": 2260 |
| }, |
| { |
| "epoch": 19.324786324786324, |
| "grad_norm": 0.9975456595420837, |
| "learning_rate": 3.376068376068376e-07, |
| "loss": 0.0266, |
| "step": 2261 |
| }, |
| { |
| "epoch": 19.333333333333332, |
| "grad_norm": 2.576789379119873, |
| "learning_rate": 3.3333333333333335e-07, |
| "loss": 0.0722, |
| "step": 2262 |
| }, |
| { |
| "epoch": 19.34188034188034, |
| "grad_norm": 9.070858001708984, |
| "learning_rate": 3.290598290598291e-07, |
| "loss": 0.2998, |
| "step": 2263 |
| }, |
| { |
| "epoch": 19.35042735042735, |
| "grad_norm": 3.052319049835205, |
| "learning_rate": 3.247863247863248e-07, |
| "loss": 0.0435, |
| "step": 2264 |
| }, |
| { |
| "epoch": 19.358974358974358, |
| "grad_norm": 0.8035821318626404, |
| "learning_rate": 3.205128205128205e-07, |
| "loss": 0.0233, |
| "step": 2265 |
| }, |
| { |
| "epoch": 19.367521367521366, |
| "grad_norm": 3.7658371925354004, |
| "learning_rate": 3.1623931623931623e-07, |
| "loss": 0.3007, |
| "step": 2266 |
| }, |
| { |
| "epoch": 19.376068376068375, |
| "grad_norm": 1.210494875907898, |
| "learning_rate": 3.11965811965812e-07, |
| "loss": 0.0344, |
| "step": 2267 |
| }, |
| { |
| "epoch": 19.384615384615383, |
| "grad_norm": 1.1121772527694702, |
| "learning_rate": 3.0769230769230774e-07, |
| "loss": 0.054, |
| "step": 2268 |
| }, |
| { |
| "epoch": 19.39316239316239, |
| "grad_norm": 2.842228412628174, |
| "learning_rate": 3.034188034188034e-07, |
| "loss": 0.0814, |
| "step": 2269 |
| }, |
| { |
| "epoch": 19.4017094017094, |
| "grad_norm": 1.9269556999206543, |
| "learning_rate": 2.991452991452992e-07, |
| "loss": 0.0354, |
| "step": 2270 |
| }, |
| { |
| "epoch": 19.41025641025641, |
| "grad_norm": 7.359715938568115, |
| "learning_rate": 2.948717948717949e-07, |
| "loss": 0.3288, |
| "step": 2271 |
| }, |
| { |
| "epoch": 19.418803418803417, |
| "grad_norm": 1.7621564865112305, |
| "learning_rate": 2.905982905982906e-07, |
| "loss": 0.0313, |
| "step": 2272 |
| }, |
| { |
| "epoch": 19.427350427350426, |
| "grad_norm": 2.5410284996032715, |
| "learning_rate": 2.8632478632478635e-07, |
| "loss": 0.076, |
| "step": 2273 |
| }, |
| { |
| "epoch": 19.435897435897434, |
| "grad_norm": 5.633874416351318, |
| "learning_rate": 2.820512820512821e-07, |
| "loss": 0.1903, |
| "step": 2274 |
| }, |
| { |
| "epoch": 19.444444444444443, |
| "grad_norm": 1.935703158378601, |
| "learning_rate": 2.7777777777777776e-07, |
| "loss": 0.3778, |
| "step": 2275 |
| }, |
| { |
| "epoch": 19.45299145299145, |
| "grad_norm": 7.559366703033447, |
| "learning_rate": 2.7350427350427354e-07, |
| "loss": 0.2684, |
| "step": 2276 |
| }, |
| { |
| "epoch": 19.46153846153846, |
| "grad_norm": 9.240869522094727, |
| "learning_rate": 2.692307692307693e-07, |
| "loss": 0.2982, |
| "step": 2277 |
| }, |
| { |
| "epoch": 19.47008547008547, |
| "grad_norm": 6.940350532531738, |
| "learning_rate": 2.6495726495726495e-07, |
| "loss": 0.3131, |
| "step": 2278 |
| }, |
| { |
| "epoch": 19.478632478632477, |
| "grad_norm": 1.3201594352722168, |
| "learning_rate": 2.6068376068376074e-07, |
| "loss": 0.0191, |
| "step": 2279 |
| }, |
| { |
| "epoch": 19.487179487179485, |
| "grad_norm": 1.626806616783142, |
| "learning_rate": 2.564102564102564e-07, |
| "loss": 0.0361, |
| "step": 2280 |
| }, |
| { |
| "epoch": 19.495726495726494, |
| "grad_norm": 8.687582969665527, |
| "learning_rate": 2.5213675213675215e-07, |
| "loss": 0.1942, |
| "step": 2281 |
| }, |
| { |
| "epoch": 19.504273504273506, |
| "grad_norm": 5.104561805725098, |
| "learning_rate": 2.478632478632479e-07, |
| "loss": 0.1906, |
| "step": 2282 |
| }, |
| { |
| "epoch": 19.51282051282051, |
| "grad_norm": 2.8611207008361816, |
| "learning_rate": 2.435897435897436e-07, |
| "loss": 0.1258, |
| "step": 2283 |
| }, |
| { |
| "epoch": 19.521367521367523, |
| "grad_norm": 1.2258422374725342, |
| "learning_rate": 2.3931623931623934e-07, |
| "loss": 0.0186, |
| "step": 2284 |
| }, |
| { |
| "epoch": 19.52991452991453, |
| "grad_norm": 5.307450294494629, |
| "learning_rate": 2.3504273504273505e-07, |
| "loss": 0.1356, |
| "step": 2285 |
| }, |
| { |
| "epoch": 19.53846153846154, |
| "grad_norm": 2.0854647159576416, |
| "learning_rate": 2.307692307692308e-07, |
| "loss": 0.0533, |
| "step": 2286 |
| }, |
| { |
| "epoch": 19.54700854700855, |
| "grad_norm": 1.8560184240341187, |
| "learning_rate": 2.264957264957265e-07, |
| "loss": 0.048, |
| "step": 2287 |
| }, |
| { |
| "epoch": 19.555555555555557, |
| "grad_norm": 5.781933307647705, |
| "learning_rate": 2.2222222222222224e-07, |
| "loss": 0.2769, |
| "step": 2288 |
| }, |
| { |
| "epoch": 19.564102564102566, |
| "grad_norm": 4.858759880065918, |
| "learning_rate": 2.1794871794871795e-07, |
| "loss": 0.4217, |
| "step": 2289 |
| }, |
| { |
| "epoch": 19.572649572649574, |
| "grad_norm": 3.7598235607147217, |
| "learning_rate": 2.136752136752137e-07, |
| "loss": 0.162, |
| "step": 2290 |
| }, |
| { |
| "epoch": 19.581196581196583, |
| "grad_norm": 0.5706556439399719, |
| "learning_rate": 2.094017094017094e-07, |
| "loss": 0.0151, |
| "step": 2291 |
| }, |
| { |
| "epoch": 19.58974358974359, |
| "grad_norm": 5.697900295257568, |
| "learning_rate": 2.0512820512820514e-07, |
| "loss": 0.1015, |
| "step": 2292 |
| }, |
| { |
| "epoch": 19.5982905982906, |
| "grad_norm": 4.635442733764648, |
| "learning_rate": 2.0085470085470088e-07, |
| "loss": 0.1827, |
| "step": 2293 |
| }, |
| { |
| "epoch": 19.60683760683761, |
| "grad_norm": 3.070131778717041, |
| "learning_rate": 1.9658119658119658e-07, |
| "loss": 0.0802, |
| "step": 2294 |
| }, |
| { |
| "epoch": 19.615384615384617, |
| "grad_norm": 0.979217529296875, |
| "learning_rate": 1.9230769230769234e-07, |
| "loss": 0.0237, |
| "step": 2295 |
| }, |
| { |
| "epoch": 19.623931623931625, |
| "grad_norm": 5.640648365020752, |
| "learning_rate": 1.8803418803418804e-07, |
| "loss": 0.0588, |
| "step": 2296 |
| }, |
| { |
| "epoch": 19.632478632478634, |
| "grad_norm": 7.1512861251831055, |
| "learning_rate": 1.8376068376068378e-07, |
| "loss": 0.1942, |
| "step": 2297 |
| }, |
| { |
| "epoch": 19.641025641025642, |
| "grad_norm": 12.868803024291992, |
| "learning_rate": 1.7948717948717948e-07, |
| "loss": 0.2771, |
| "step": 2298 |
| }, |
| { |
| "epoch": 19.64957264957265, |
| "grad_norm": 2.954000234603882, |
| "learning_rate": 1.7521367521367524e-07, |
| "loss": 0.1124, |
| "step": 2299 |
| }, |
| { |
| "epoch": 19.65811965811966, |
| "grad_norm": 0.47206825017929077, |
| "learning_rate": 1.7094017094017097e-07, |
| "loss": 0.0104, |
| "step": 2300 |
| }, |
| { |
| "epoch": 19.666666666666668, |
| "grad_norm": 0.6243001818656921, |
| "learning_rate": 1.6666666666666668e-07, |
| "loss": 0.0145, |
| "step": 2301 |
| }, |
| { |
| "epoch": 19.675213675213676, |
| "grad_norm": 1.6680350303649902, |
| "learning_rate": 1.623931623931624e-07, |
| "loss": 0.0634, |
| "step": 2302 |
| }, |
| { |
| "epoch": 19.683760683760685, |
| "grad_norm": 6.298573017120361, |
| "learning_rate": 1.5811965811965811e-07, |
| "loss": 0.2083, |
| "step": 2303 |
| }, |
| { |
| "epoch": 19.692307692307693, |
| "grad_norm": 0.622466504573822, |
| "learning_rate": 1.5384615384615387e-07, |
| "loss": 0.0155, |
| "step": 2304 |
| }, |
| { |
| "epoch": 19.700854700854702, |
| "grad_norm": 2.289080858230591, |
| "learning_rate": 1.495726495726496e-07, |
| "loss": 0.0698, |
| "step": 2305 |
| }, |
| { |
| "epoch": 19.70940170940171, |
| "grad_norm": 13.065472602844238, |
| "learning_rate": 1.452991452991453e-07, |
| "loss": 0.2587, |
| "step": 2306 |
| }, |
| { |
| "epoch": 19.71794871794872, |
| "grad_norm": 0.903513491153717, |
| "learning_rate": 1.4102564102564104e-07, |
| "loss": 0.0222, |
| "step": 2307 |
| }, |
| { |
| "epoch": 19.726495726495727, |
| "grad_norm": 1.3763283491134644, |
| "learning_rate": 1.3675213675213677e-07, |
| "loss": 0.042, |
| "step": 2308 |
| }, |
| { |
| "epoch": 19.735042735042736, |
| "grad_norm": 3.3493802547454834, |
| "learning_rate": 1.3247863247863248e-07, |
| "loss": 0.1042, |
| "step": 2309 |
| }, |
| { |
| "epoch": 19.743589743589745, |
| "grad_norm": 12.862226486206055, |
| "learning_rate": 1.282051282051282e-07, |
| "loss": 0.359, |
| "step": 2310 |
| }, |
| { |
| "epoch": 19.752136752136753, |
| "grad_norm": 5.56069278717041, |
| "learning_rate": 1.2393162393162394e-07, |
| "loss": 0.1645, |
| "step": 2311 |
| }, |
| { |
| "epoch": 19.76068376068376, |
| "grad_norm": 2.900381326675415, |
| "learning_rate": 1.1965811965811967e-07, |
| "loss": 0.1641, |
| "step": 2312 |
| }, |
| { |
| "epoch": 19.76923076923077, |
| "grad_norm": 1.3674333095550537, |
| "learning_rate": 1.153846153846154e-07, |
| "loss": 0.0428, |
| "step": 2313 |
| }, |
| { |
| "epoch": 19.77777777777778, |
| "grad_norm": 2.06278657913208, |
| "learning_rate": 1.1111111111111112e-07, |
| "loss": 0.0404, |
| "step": 2314 |
| }, |
| { |
| "epoch": 19.786324786324787, |
| "grad_norm": 5.760499954223633, |
| "learning_rate": 1.0683760683760685e-07, |
| "loss": 0.1298, |
| "step": 2315 |
| }, |
| { |
| "epoch": 19.794871794871796, |
| "grad_norm": 3.2554516792297363, |
| "learning_rate": 1.0256410256410257e-07, |
| "loss": 0.0432, |
| "step": 2316 |
| }, |
| { |
| "epoch": 19.803418803418804, |
| "grad_norm": 1.7984355688095093, |
| "learning_rate": 9.829059829059829e-08, |
| "loss": 0.0461, |
| "step": 2317 |
| }, |
| { |
| "epoch": 19.811965811965813, |
| "grad_norm": 1.633736491203308, |
| "learning_rate": 9.401709401709402e-08, |
| "loss": 0.0746, |
| "step": 2318 |
| }, |
| { |
| "epoch": 19.82051282051282, |
| "grad_norm": 2.6958866119384766, |
| "learning_rate": 8.974358974358974e-08, |
| "loss": 0.0852, |
| "step": 2319 |
| }, |
| { |
| "epoch": 19.82905982905983, |
| "grad_norm": 0.9744161367416382, |
| "learning_rate": 8.547008547008549e-08, |
| "loss": 0.0368, |
| "step": 2320 |
| }, |
| { |
| "epoch": 19.837606837606838, |
| "grad_norm": 1.2404037714004517, |
| "learning_rate": 8.11965811965812e-08, |
| "loss": 0.0547, |
| "step": 2321 |
| }, |
| { |
| "epoch": 19.846153846153847, |
| "grad_norm": 1.6044564247131348, |
| "learning_rate": 7.692307692307694e-08, |
| "loss": 0.0441, |
| "step": 2322 |
| }, |
| { |
| "epoch": 19.854700854700855, |
| "grad_norm": 0.47167596220970154, |
| "learning_rate": 7.264957264957265e-08, |
| "loss": 0.0099, |
| "step": 2323 |
| }, |
| { |
| "epoch": 19.863247863247864, |
| "grad_norm": 1.6729376316070557, |
| "learning_rate": 6.837606837606839e-08, |
| "loss": 0.0258, |
| "step": 2324 |
| }, |
| { |
| "epoch": 19.871794871794872, |
| "grad_norm": 0.5823857188224792, |
| "learning_rate": 6.41025641025641e-08, |
| "loss": 0.0131, |
| "step": 2325 |
| }, |
| { |
| "epoch": 19.88034188034188, |
| "grad_norm": 4.055545806884766, |
| "learning_rate": 5.982905982905984e-08, |
| "loss": 0.073, |
| "step": 2326 |
| }, |
| { |
| "epoch": 19.88888888888889, |
| "grad_norm": 2.693838596343994, |
| "learning_rate": 5.555555555555556e-08, |
| "loss": 0.0845, |
| "step": 2327 |
| }, |
| { |
| "epoch": 19.897435897435898, |
| "grad_norm": 0.9895898103713989, |
| "learning_rate": 5.1282051282051286e-08, |
| "loss": 0.0205, |
| "step": 2328 |
| }, |
| { |
| "epoch": 19.905982905982906, |
| "grad_norm": 3.560816526412964, |
| "learning_rate": 4.700854700854701e-08, |
| "loss": 0.0989, |
| "step": 2329 |
| }, |
| { |
| "epoch": 19.914529914529915, |
| "grad_norm": 5.152528762817383, |
| "learning_rate": 4.273504273504274e-08, |
| "loss": 0.0133, |
| "step": 2330 |
| }, |
| { |
| "epoch": 19.923076923076923, |
| "grad_norm": 1.709021806716919, |
| "learning_rate": 3.846153846153847e-08, |
| "loss": 0.068, |
| "step": 2331 |
| }, |
| { |
| "epoch": 19.931623931623932, |
| "grad_norm": 0.4786951541900635, |
| "learning_rate": 3.418803418803419e-08, |
| "loss": 0.0141, |
| "step": 2332 |
| }, |
| { |
| "epoch": 19.94017094017094, |
| "grad_norm": 1.5413727760314941, |
| "learning_rate": 2.991452991452992e-08, |
| "loss": 0.0246, |
| "step": 2333 |
| }, |
| { |
| "epoch": 19.94871794871795, |
| "grad_norm": 1.019601583480835, |
| "learning_rate": 2.5641025641025643e-08, |
| "loss": 0.0199, |
| "step": 2334 |
| }, |
| { |
| "epoch": 19.957264957264957, |
| "grad_norm": 1.6115524768829346, |
| "learning_rate": 2.136752136752137e-08, |
| "loss": 0.0752, |
| "step": 2335 |
| }, |
| { |
| "epoch": 19.965811965811966, |
| "grad_norm": 2.381624698638916, |
| "learning_rate": 1.7094017094017096e-08, |
| "loss": 0.0609, |
| "step": 2336 |
| }, |
| { |
| "epoch": 19.974358974358974, |
| "grad_norm": 1.688704013824463, |
| "learning_rate": 1.2820512820512822e-08, |
| "loss": 0.0419, |
| "step": 2337 |
| }, |
| { |
| "epoch": 19.982905982905983, |
| "grad_norm": 1.643002986907959, |
| "learning_rate": 8.547008547008548e-09, |
| "loss": 0.0456, |
| "step": 2338 |
| }, |
| { |
| "epoch": 19.99145299145299, |
| "grad_norm": 3.5371882915496826, |
| "learning_rate": 4.273504273504274e-09, |
| "loss": 0.0392, |
| "step": 2339 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 4.692568302154541, |
| "learning_rate": 0.0, |
| "loss": 0.1751, |
| "step": 2340 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 0.051427390426397324, |
| "eval_runtime": 9.301, |
| "eval_samples_per_second": 50.102, |
| "eval_steps_per_second": 6.343, |
| "step": 2340 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 2340, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 560912565657600.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|