{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 1000, "global_step": 3795, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007905138339920949, "grad_norm": 2.3525755405426025, "learning_rate": 0.0, "loss": 4.0224, "step": 1 }, { "epoch": 0.0015810276679841897, "grad_norm": 2.0600616931915283, "learning_rate": 7.894736842105262e-08, "loss": 4.2118, "step": 2 }, { "epoch": 0.0023715415019762848, "grad_norm": 2.1124753952026367, "learning_rate": 1.5789473684210525e-07, "loss": 4.0496, "step": 3 }, { "epoch": 0.0031620553359683794, "grad_norm": 2.1446008682250977, "learning_rate": 2.368421052631579e-07, "loss": 3.9723, "step": 4 }, { "epoch": 0.003952569169960474, "grad_norm": 2.110840082168579, "learning_rate": 3.157894736842105e-07, "loss": 4.0116, "step": 5 }, { "epoch": 0.0047430830039525695, "grad_norm": 2.163710594177246, "learning_rate": 3.9473684210526315e-07, "loss": 4.2974, "step": 6 }, { "epoch": 0.005533596837944664, "grad_norm": 2.444607734680176, "learning_rate": 4.736842105263158e-07, "loss": 3.8112, "step": 7 }, { "epoch": 0.006324110671936759, "grad_norm": 1.9517990350723267, "learning_rate": 5.526315789473684e-07, "loss": 4.3712, "step": 8 }, { "epoch": 0.0071146245059288534, "grad_norm": 2.086073398590088, "learning_rate": 6.31578947368421e-07, "loss": 4.212, "step": 9 }, { "epoch": 0.007905138339920948, "grad_norm": 2.0169837474823, "learning_rate": 7.105263157894736e-07, "loss": 4.1616, "step": 10 }, { "epoch": 0.008695652173913044, "grad_norm": 2.1588385105133057, "learning_rate": 7.894736842105263e-07, "loss": 3.9589, "step": 11 }, { "epoch": 0.009486166007905139, "grad_norm": 1.8925681114196777, "learning_rate": 8.68421052631579e-07, "loss": 4.5233, "step": 12 }, { "epoch": 0.010276679841897233, "grad_norm": 2.1398088932037354, "learning_rate": 9.473684210526316e-07, "loss": 3.9499, "step": 13 }, { "epoch": 0.011067193675889328, "grad_norm": 1.9372698068618774, "learning_rate": 1.0263157894736843e-06, "loss": 4.3695, "step": 14 }, { "epoch": 0.011857707509881422, "grad_norm": 2.1779675483703613, "learning_rate": 1.1052631578947369e-06, "loss": 4.2988, "step": 15 }, { "epoch": 0.012648221343873518, "grad_norm": 2.0454978942871094, "learning_rate": 1.1842105263157894e-06, "loss": 4.165, "step": 16 }, { "epoch": 0.013438735177865613, "grad_norm": 2.2042953968048096, "learning_rate": 1.263157894736842e-06, "loss": 3.9682, "step": 17 }, { "epoch": 0.014229249011857707, "grad_norm": 2.256051540374756, "learning_rate": 1.3421052631578947e-06, "loss": 3.9352, "step": 18 }, { "epoch": 0.015019762845849802, "grad_norm": 2.1645617485046387, "learning_rate": 1.4210526315789473e-06, "loss": 4.006, "step": 19 }, { "epoch": 0.015810276679841896, "grad_norm": 2.1401028633117676, "learning_rate": 1.5e-06, "loss": 4.2207, "step": 20 }, { "epoch": 0.016600790513833993, "grad_norm": 2.1551826000213623, "learning_rate": 1.5789473684210526e-06, "loss": 4.0551, "step": 21 }, { "epoch": 0.017391304347826087, "grad_norm": 2.313161611557007, "learning_rate": 1.6578947368421056e-06, "loss": 4.0508, "step": 22 }, { "epoch": 0.01818181818181818, "grad_norm": 2.102410078048706, "learning_rate": 1.736842105263158e-06, "loss": 3.9553, "step": 23 }, { "epoch": 0.018972332015810278, "grad_norm": 2.349500894546509, "learning_rate": 1.8157894736842106e-06, "loss": 4.0692, "step": 24 }, { "epoch": 0.019762845849802372, "grad_norm": 2.1570918560028076, "learning_rate": 1.8947368421052632e-06, "loss": 4.0775, "step": 25 }, { "epoch": 0.020553359683794466, "grad_norm": 2.3608903884887695, "learning_rate": 1.973684210526316e-06, "loss": 4.229, "step": 26 }, { "epoch": 0.021343873517786563, "grad_norm": 2.5491955280303955, "learning_rate": 2.0526315789473687e-06, "loss": 3.7455, "step": 27 }, { "epoch": 0.022134387351778657, "grad_norm": 2.239769458770752, "learning_rate": 2.1315789473684212e-06, "loss": 4.2418, "step": 28 }, { "epoch": 0.02292490118577075, "grad_norm": 2.0471608638763428, "learning_rate": 2.2105263157894738e-06, "loss": 4.2996, "step": 29 }, { "epoch": 0.023715415019762844, "grad_norm": 1.9792560338974, "learning_rate": 2.2894736842105263e-06, "loss": 4.1828, "step": 30 }, { "epoch": 0.02450592885375494, "grad_norm": 2.3666114807128906, "learning_rate": 2.368421052631579e-06, "loss": 4.1708, "step": 31 }, { "epoch": 0.025296442687747035, "grad_norm": 2.3661327362060547, "learning_rate": 2.4473684210526314e-06, "loss": 3.6749, "step": 32 }, { "epoch": 0.02608695652173913, "grad_norm": 2.0626816749572754, "learning_rate": 2.526315789473684e-06, "loss": 4.1536, "step": 33 }, { "epoch": 0.026877470355731226, "grad_norm": 2.1796371936798096, "learning_rate": 2.605263157894737e-06, "loss": 3.9562, "step": 34 }, { "epoch": 0.02766798418972332, "grad_norm": 2.166775703430176, "learning_rate": 2.6842105263157895e-06, "loss": 3.8166, "step": 35 }, { "epoch": 0.028458498023715414, "grad_norm": 2.1974940299987793, "learning_rate": 2.763157894736842e-06, "loss": 4.0034, "step": 36 }, { "epoch": 0.02924901185770751, "grad_norm": 2.23091197013855, "learning_rate": 2.8421052631578946e-06, "loss": 3.8891, "step": 37 }, { "epoch": 0.030039525691699605, "grad_norm": 2.2130661010742188, "learning_rate": 2.9210526315789475e-06, "loss": 3.9869, "step": 38 }, { "epoch": 0.0308300395256917, "grad_norm": 2.4537622928619385, "learning_rate": 3e-06, "loss": 3.9677, "step": 39 }, { "epoch": 0.03162055335968379, "grad_norm": 2.1675639152526855, "learning_rate": 2.9999994755807884e-06, "loss": 4.2043, "step": 40 }, { "epoch": 0.03241106719367589, "grad_norm": 2.23058819770813, "learning_rate": 2.9999979023235203e-06, "loss": 3.9211, "step": 41 }, { "epoch": 0.03320158102766799, "grad_norm": 2.1653480529785156, "learning_rate": 2.999995280229295e-06, "loss": 3.9449, "step": 42 }, { "epoch": 0.03399209486166008, "grad_norm": 2.176516056060791, "learning_rate": 2.9999916092999467e-06, "loss": 3.9764, "step": 43 }, { "epoch": 0.034782608695652174, "grad_norm": 2.040138006210327, "learning_rate": 2.9999868895380424e-06, "loss": 4.2153, "step": 44 }, { "epoch": 0.03557312252964427, "grad_norm": 2.1505684852600098, "learning_rate": 2.9999811209468814e-06, "loss": 4.1651, "step": 45 }, { "epoch": 0.03636363636363636, "grad_norm": 2.546290397644043, "learning_rate": 2.9999743035304977e-06, "loss": 4.2715, "step": 46 }, { "epoch": 0.03715415019762846, "grad_norm": 2.4898622035980225, "learning_rate": 2.999966437293659e-06, "loss": 3.9661, "step": 47 }, { "epoch": 0.037944664031620556, "grad_norm": 2.169595956802368, "learning_rate": 2.9999575222418647e-06, "loss": 3.7696, "step": 48 }, { "epoch": 0.03873517786561265, "grad_norm": 2.063196897506714, "learning_rate": 2.999947558381348e-06, "loss": 4.1368, "step": 49 }, { "epoch": 0.039525691699604744, "grad_norm": 2.148139715194702, "learning_rate": 2.9999365457190765e-06, "loss": 3.7512, "step": 50 }, { "epoch": 0.04031620553359684, "grad_norm": 2.4991543292999268, "learning_rate": 2.9999244842627506e-06, "loss": 3.9946, "step": 51 }, { "epoch": 0.04110671936758893, "grad_norm": 2.256162643432617, "learning_rate": 2.999911374020804e-06, "loss": 4.038, "step": 52 }, { "epoch": 0.04189723320158103, "grad_norm": 1.9070383310317993, "learning_rate": 2.9998972150024035e-06, "loss": 4.4979, "step": 53 }, { "epoch": 0.042687747035573126, "grad_norm": 2.1344470977783203, "learning_rate": 2.9998820072174494e-06, "loss": 4.0851, "step": 54 }, { "epoch": 0.043478260869565216, "grad_norm": 2.0585763454437256, "learning_rate": 2.999865750676576e-06, "loss": 4.0656, "step": 55 }, { "epoch": 0.04426877470355731, "grad_norm": 2.3745193481445312, "learning_rate": 2.999848445391149e-06, "loss": 3.4175, "step": 56 }, { "epoch": 0.045059288537549404, "grad_norm": 1.946401596069336, "learning_rate": 2.99983009137327e-06, "loss": 4.2012, "step": 57 }, { "epoch": 0.0458498023715415, "grad_norm": 2.521094799041748, "learning_rate": 2.9998106886357718e-06, "loss": 4.1337, "step": 58 }, { "epoch": 0.0466403162055336, "grad_norm": 2.0097901821136475, "learning_rate": 2.999790237192222e-06, "loss": 4.1367, "step": 59 }, { "epoch": 0.04743083003952569, "grad_norm": 2.1987316608428955, "learning_rate": 2.9997687370569202e-06, "loss": 4.2322, "step": 60 }, { "epoch": 0.048221343873517786, "grad_norm": 2.2189881801605225, "learning_rate": 2.9997461882449e-06, "loss": 3.7895, "step": 61 }, { "epoch": 0.04901185770750988, "grad_norm": 1.925830602645874, "learning_rate": 2.999722590771928e-06, "loss": 4.3131, "step": 62 }, { "epoch": 0.04980237154150197, "grad_norm": 1.9821134805679321, "learning_rate": 2.9996979446545045e-06, "loss": 4.3005, "step": 63 }, { "epoch": 0.05059288537549407, "grad_norm": 2.1638131141662598, "learning_rate": 2.9996722499098623e-06, "loss": 3.9855, "step": 64 }, { "epoch": 0.05138339920948617, "grad_norm": 2.198929786682129, "learning_rate": 2.9996455065559676e-06, "loss": 4.1785, "step": 65 }, { "epoch": 0.05217391304347826, "grad_norm": 2.295278787612915, "learning_rate": 2.999617714611521e-06, "loss": 3.8634, "step": 66 }, { "epoch": 0.052964426877470355, "grad_norm": 2.394866943359375, "learning_rate": 2.9995888740959546e-06, "loss": 3.7341, "step": 67 }, { "epoch": 0.05375494071146245, "grad_norm": 2.1780383586883545, "learning_rate": 2.999558985029434e-06, "loss": 4.0904, "step": 68 }, { "epoch": 0.05454545454545454, "grad_norm": 2.109621286392212, "learning_rate": 2.99952804743286e-06, "loss": 4.4652, "step": 69 }, { "epoch": 0.05533596837944664, "grad_norm": 2.2139735221862793, "learning_rate": 2.999496061327864e-06, "loss": 3.8805, "step": 70 }, { "epoch": 0.05612648221343874, "grad_norm": 2.0347373485565186, "learning_rate": 2.9994630267368104e-06, "loss": 4.1093, "step": 71 }, { "epoch": 0.05691699604743083, "grad_norm": 2.434758424758911, "learning_rate": 2.9994289436827993e-06, "loss": 3.5066, "step": 72 }, { "epoch": 0.057707509881422925, "grad_norm": 2.1831600666046143, "learning_rate": 2.9993938121896626e-06, "loss": 4.1467, "step": 73 }, { "epoch": 0.05849802371541502, "grad_norm": 2.139586925506592, "learning_rate": 2.999357632281964e-06, "loss": 4.0325, "step": 74 }, { "epoch": 0.05928853754940711, "grad_norm": 2.1632754802703857, "learning_rate": 2.9993204039850017e-06, "loss": 4.3309, "step": 75 }, { "epoch": 0.06007905138339921, "grad_norm": 2.0611112117767334, "learning_rate": 2.999282127324808e-06, "loss": 4.1195, "step": 76 }, { "epoch": 0.06086956521739131, "grad_norm": 2.579733371734619, "learning_rate": 2.999242802328145e-06, "loss": 4.0418, "step": 77 }, { "epoch": 0.0616600790513834, "grad_norm": 2.044879198074341, "learning_rate": 2.9992024290225106e-06, "loss": 4.0549, "step": 78 }, { "epoch": 0.062450592885375494, "grad_norm": 2.206369400024414, "learning_rate": 2.9991610074361353e-06, "loss": 3.9663, "step": 79 }, { "epoch": 0.06324110671936758, "grad_norm": 2.158003807067871, "learning_rate": 2.999118537597982e-06, "loss": 4.1496, "step": 80 }, { "epoch": 0.06403162055335969, "grad_norm": 2.1410038471221924, "learning_rate": 2.9990750195377456e-06, "loss": 4.022, "step": 81 }, { "epoch": 0.06482213438735178, "grad_norm": 2.1248581409454346, "learning_rate": 2.9990304532858565e-06, "loss": 4.1119, "step": 82 }, { "epoch": 0.06561264822134387, "grad_norm": 2.295269250869751, "learning_rate": 2.9989848388734755e-06, "loss": 3.7237, "step": 83 }, { "epoch": 0.06640316205533597, "grad_norm": 2.1206767559051514, "learning_rate": 2.9989381763324983e-06, "loss": 4.1627, "step": 84 }, { "epoch": 0.06719367588932806, "grad_norm": 2.088980197906494, "learning_rate": 2.9988904656955515e-06, "loss": 3.8973, "step": 85 }, { "epoch": 0.06798418972332015, "grad_norm": 2.358125925064087, "learning_rate": 2.9988417069959963e-06, "loss": 3.7488, "step": 86 }, { "epoch": 0.06877470355731226, "grad_norm": 2.070671319961548, "learning_rate": 2.998791900267926e-06, "loss": 4.4996, "step": 87 }, { "epoch": 0.06956521739130435, "grad_norm": 2.2888898849487305, "learning_rate": 2.9987410455461667e-06, "loss": 3.8751, "step": 88 }, { "epoch": 0.07035573122529644, "grad_norm": 2.27386474609375, "learning_rate": 2.998689142866277e-06, "loss": 4.2115, "step": 89 }, { "epoch": 0.07114624505928854, "grad_norm": 2.027107000350952, "learning_rate": 2.9986361922645487e-06, "loss": 4.2008, "step": 90 }, { "epoch": 0.07193675889328063, "grad_norm": 2.0253472328186035, "learning_rate": 2.9985821937780066e-06, "loss": 4.263, "step": 91 }, { "epoch": 0.07272727272727272, "grad_norm": 2.368701457977295, "learning_rate": 2.9985271474444078e-06, "loss": 3.8224, "step": 92 }, { "epoch": 0.07351778656126483, "grad_norm": 2.182523727416992, "learning_rate": 2.9984710533022414e-06, "loss": 3.9113, "step": 93 }, { "epoch": 0.07430830039525692, "grad_norm": 1.9031565189361572, "learning_rate": 2.9984139113907305e-06, "loss": 4.5173, "step": 94 }, { "epoch": 0.07509881422924901, "grad_norm": 2.281806468963623, "learning_rate": 2.9983557217498297e-06, "loss": 3.9607, "step": 95 }, { "epoch": 0.07588932806324111, "grad_norm": 2.258915901184082, "learning_rate": 2.9982964844202276e-06, "loss": 3.6496, "step": 96 }, { "epoch": 0.0766798418972332, "grad_norm": 2.0588953495025635, "learning_rate": 2.998236199443343e-06, "loss": 4.0941, "step": 97 }, { "epoch": 0.0774703557312253, "grad_norm": 2.4058196544647217, "learning_rate": 2.998174866861331e-06, "loss": 3.8719, "step": 98 }, { "epoch": 0.0782608695652174, "grad_norm": 2.408834934234619, "learning_rate": 2.998112486717074e-06, "loss": 4.0668, "step": 99 }, { "epoch": 0.07905138339920949, "grad_norm": 2.7144060134887695, "learning_rate": 2.998049059054192e-06, "loss": 4.0527, "step": 100 }, { "epoch": 0.07984189723320158, "grad_norm": 2.2652578353881836, "learning_rate": 2.9979845839170343e-06, "loss": 3.8198, "step": 101 }, { "epoch": 0.08063241106719368, "grad_norm": 2.2039248943328857, "learning_rate": 2.9979190613506836e-06, "loss": 4.0276, "step": 102 }, { "epoch": 0.08142292490118577, "grad_norm": 2.0733706951141357, "learning_rate": 2.9978524914009556e-06, "loss": 4.2908, "step": 103 }, { "epoch": 0.08221343873517786, "grad_norm": 2.1162328720092773, "learning_rate": 2.9977848741143968e-06, "loss": 4.0348, "step": 104 }, { "epoch": 0.08300395256916997, "grad_norm": 2.1010584831237793, "learning_rate": 2.9977162095382873e-06, "loss": 4.0057, "step": 105 }, { "epoch": 0.08379446640316206, "grad_norm": 2.1681580543518066, "learning_rate": 2.9976464977206394e-06, "loss": 4.1719, "step": 106 }, { "epoch": 0.08458498023715415, "grad_norm": 2.1655774116516113, "learning_rate": 2.997575738710198e-06, "loss": 4.061, "step": 107 }, { "epoch": 0.08537549407114625, "grad_norm": 2.3572065830230713, "learning_rate": 2.9975039325564375e-06, "loss": 3.8306, "step": 108 }, { "epoch": 0.08616600790513834, "grad_norm": 1.9788386821746826, "learning_rate": 2.997431079309569e-06, "loss": 4.1457, "step": 109 }, { "epoch": 0.08695652173913043, "grad_norm": 2.628906011581421, "learning_rate": 2.9973571790205313e-06, "loss": 3.8336, "step": 110 }, { "epoch": 0.08774703557312254, "grad_norm": 2.421767473220825, "learning_rate": 2.997282231740999e-06, "loss": 3.9025, "step": 111 }, { "epoch": 0.08853754940711463, "grad_norm": 2.2383229732513428, "learning_rate": 2.9972062375233765e-06, "loss": 4.1001, "step": 112 }, { "epoch": 0.08932806324110672, "grad_norm": 2.129281520843506, "learning_rate": 2.9971291964208006e-06, "loss": 4.0047, "step": 113 }, { "epoch": 0.09011857707509881, "grad_norm": 1.9687330722808838, "learning_rate": 2.997051108487141e-06, "loss": 4.4042, "step": 114 }, { "epoch": 0.09090909090909091, "grad_norm": 2.149487257003784, "learning_rate": 2.9969719737769987e-06, "loss": 4.4825, "step": 115 }, { "epoch": 0.091699604743083, "grad_norm": 2.1691808700561523, "learning_rate": 2.9968917923457063e-06, "loss": 3.9981, "step": 116 }, { "epoch": 0.09249011857707509, "grad_norm": 2.2295241355895996, "learning_rate": 2.996810564249329e-06, "loss": 4.0025, "step": 117 }, { "epoch": 0.0932806324110672, "grad_norm": 2.453299045562744, "learning_rate": 2.9967282895446634e-06, "loss": 4.0145, "step": 118 }, { "epoch": 0.09407114624505929, "grad_norm": 2.9178500175476074, "learning_rate": 2.996644968289238e-06, "loss": 4.0836, "step": 119 }, { "epoch": 0.09486166007905138, "grad_norm": 2.0483267307281494, "learning_rate": 2.9965606005413134e-06, "loss": 4.371, "step": 120 }, { "epoch": 0.09565217391304348, "grad_norm": 2.163691282272339, "learning_rate": 2.996475186359882e-06, "loss": 3.9632, "step": 121 }, { "epoch": 0.09644268774703557, "grad_norm": 2.0353848934173584, "learning_rate": 2.9963887258046664e-06, "loss": 4.1251, "step": 122 }, { "epoch": 0.09723320158102766, "grad_norm": 2.341022491455078, "learning_rate": 2.996301218936124e-06, "loss": 4.4071, "step": 123 }, { "epoch": 0.09802371541501977, "grad_norm": 2.1347267627716064, "learning_rate": 2.9962126658154398e-06, "loss": 4.1514, "step": 124 }, { "epoch": 0.09881422924901186, "grad_norm": 2.237863540649414, "learning_rate": 2.9961230665045335e-06, "loss": 3.7737, "step": 125 }, { "epoch": 0.09960474308300395, "grad_norm": 2.1729657649993896, "learning_rate": 2.996032421066055e-06, "loss": 4.1823, "step": 126 }, { "epoch": 0.10039525691699605, "grad_norm": 2.102367639541626, "learning_rate": 2.9959407295633864e-06, "loss": 4.0461, "step": 127 }, { "epoch": 0.10118577075098814, "grad_norm": 2.0967512130737305, "learning_rate": 2.99584799206064e-06, "loss": 3.8743, "step": 128 }, { "epoch": 0.10197628458498023, "grad_norm": 2.2338128089904785, "learning_rate": 2.9957542086226608e-06, "loss": 3.9604, "step": 129 }, { "epoch": 0.10276679841897234, "grad_norm": 2.149531602859497, "learning_rate": 2.9956593793150235e-06, "loss": 4.207, "step": 130 }, { "epoch": 0.10355731225296443, "grad_norm": 2.560389518737793, "learning_rate": 2.9955635042040366e-06, "loss": 3.8557, "step": 131 }, { "epoch": 0.10434782608695652, "grad_norm": 2.1718666553497314, "learning_rate": 2.995466583356738e-06, "loss": 3.9104, "step": 132 }, { "epoch": 0.10513833992094862, "grad_norm": 2.1646528244018555, "learning_rate": 2.9953686168408964e-06, "loss": 3.87, "step": 133 }, { "epoch": 0.10592885375494071, "grad_norm": 2.1367616653442383, "learning_rate": 2.995269604725014e-06, "loss": 4.1627, "step": 134 }, { "epoch": 0.1067193675889328, "grad_norm": 2.0020968914031982, "learning_rate": 2.9951695470783212e-06, "loss": 4.1564, "step": 135 }, { "epoch": 0.1075098814229249, "grad_norm": 2.477701187133789, "learning_rate": 2.995068443970782e-06, "loss": 4.1258, "step": 136 }, { "epoch": 0.108300395256917, "grad_norm": 2.2424275875091553, "learning_rate": 2.994966295473089e-06, "loss": 4.1132, "step": 137 }, { "epoch": 0.10909090909090909, "grad_norm": 2.350632429122925, "learning_rate": 2.994863101656668e-06, "loss": 4.0768, "step": 138 }, { "epoch": 0.10988142292490119, "grad_norm": 2.0493128299713135, "learning_rate": 2.9947588625936746e-06, "loss": 4.0208, "step": 139 }, { "epoch": 0.11067193675889328, "grad_norm": 2.3842661380767822, "learning_rate": 2.994653578356996e-06, "loss": 3.7812, "step": 140 }, { "epoch": 0.11146245059288537, "grad_norm": 2.2907865047454834, "learning_rate": 2.994547249020248e-06, "loss": 4.088, "step": 141 }, { "epoch": 0.11225296442687747, "grad_norm": 2.357509136199951, "learning_rate": 2.9944398746577805e-06, "loss": 3.3449, "step": 142 }, { "epoch": 0.11304347826086956, "grad_norm": 2.030059576034546, "learning_rate": 2.994331455344671e-06, "loss": 4.2368, "step": 143 }, { "epoch": 0.11383399209486166, "grad_norm": 2.136507511138916, "learning_rate": 2.9942219911567302e-06, "loss": 3.9095, "step": 144 }, { "epoch": 0.11462450592885376, "grad_norm": 2.0529375076293945, "learning_rate": 2.9941114821704978e-06, "loss": 4.2137, "step": 145 }, { "epoch": 0.11541501976284585, "grad_norm": 1.9939981698989868, "learning_rate": 2.9939999284632442e-06, "loss": 4.1806, "step": 146 }, { "epoch": 0.11620553359683794, "grad_norm": 2.4340360164642334, "learning_rate": 2.993887330112971e-06, "loss": 4.0625, "step": 147 }, { "epoch": 0.11699604743083004, "grad_norm": 2.4280951023101807, "learning_rate": 2.9937736871984107e-06, "loss": 3.9424, "step": 148 }, { "epoch": 0.11778656126482213, "grad_norm": 2.1547837257385254, "learning_rate": 2.9936589997990234e-06, "loss": 4.2026, "step": 149 }, { "epoch": 0.11857707509881422, "grad_norm": 2.586975574493408, "learning_rate": 2.993543267995003e-06, "loss": 4.2367, "step": 150 }, { "epoch": 0.11936758893280633, "grad_norm": 2.2812459468841553, "learning_rate": 2.993426491867271e-06, "loss": 3.8642, "step": 151 }, { "epoch": 0.12015810276679842, "grad_norm": 2.190114736557007, "learning_rate": 2.9933086714974815e-06, "loss": 4.2678, "step": 152 }, { "epoch": 0.12094861660079051, "grad_norm": 2.0854249000549316, "learning_rate": 2.993189806968017e-06, "loss": 4.194, "step": 153 }, { "epoch": 0.12173913043478261, "grad_norm": 3.3095147609710693, "learning_rate": 2.99306989836199e-06, "loss": 3.8606, "step": 154 }, { "epoch": 0.1225296442687747, "grad_norm": 2.397836446762085, "learning_rate": 2.992948945763245e-06, "loss": 3.5242, "step": 155 }, { "epoch": 0.1233201581027668, "grad_norm": 2.2648520469665527, "learning_rate": 2.992826949256354e-06, "loss": 3.9142, "step": 156 }, { "epoch": 0.1241106719367589, "grad_norm": 2.0647287368774414, "learning_rate": 2.99270390892662e-06, "loss": 4.3981, "step": 157 }, { "epoch": 0.12490118577075099, "grad_norm": 2.391953945159912, "learning_rate": 2.992579824860077e-06, "loss": 3.8474, "step": 158 }, { "epoch": 0.12569169960474308, "grad_norm": 2.16349196434021, "learning_rate": 2.9924546971434866e-06, "loss": 4.1012, "step": 159 }, { "epoch": 0.12648221343873517, "grad_norm": 2.1659018993377686, "learning_rate": 2.9923285258643422e-06, "loss": 4.035, "step": 160 }, { "epoch": 0.12727272727272726, "grad_norm": 2.19012188911438, "learning_rate": 2.9922013111108657e-06, "loss": 4.3093, "step": 161 }, { "epoch": 0.12806324110671938, "grad_norm": 2.322930335998535, "learning_rate": 2.992073052972009e-06, "loss": 4.1178, "step": 162 }, { "epoch": 0.12885375494071147, "grad_norm": 2.234025478363037, "learning_rate": 2.991943751537453e-06, "loss": 4.0093, "step": 163 }, { "epoch": 0.12964426877470356, "grad_norm": 2.408560037612915, "learning_rate": 2.991813406897609e-06, "loss": 3.8716, "step": 164 }, { "epoch": 0.13043478260869565, "grad_norm": 2.1708648204803467, "learning_rate": 2.991682019143617e-06, "loss": 3.6207, "step": 165 }, { "epoch": 0.13122529644268774, "grad_norm": 2.314877986907959, "learning_rate": 2.9915495883673476e-06, "loss": 4.2877, "step": 166 }, { "epoch": 0.13201581027667983, "grad_norm": 2.201143503189087, "learning_rate": 2.9914161146613986e-06, "loss": 3.538, "step": 167 }, { "epoch": 0.13280632411067195, "grad_norm": 2.551494598388672, "learning_rate": 2.9912815981190988e-06, "loss": 4.0633, "step": 168 }, { "epoch": 0.13359683794466404, "grad_norm": 2.5859265327453613, "learning_rate": 2.9911460388345057e-06, "loss": 3.7173, "step": 169 }, { "epoch": 0.13438735177865613, "grad_norm": 1.992060661315918, "learning_rate": 2.9910094369024054e-06, "loss": 4.3188, "step": 170 }, { "epoch": 0.13517786561264822, "grad_norm": 2.6336441040039062, "learning_rate": 2.990871792418314e-06, "loss": 4.1078, "step": 171 }, { "epoch": 0.1359683794466403, "grad_norm": 2.4614310264587402, "learning_rate": 2.9907331054784755e-06, "loss": 3.9639, "step": 172 }, { "epoch": 0.1367588932806324, "grad_norm": 2.212898015975952, "learning_rate": 2.990593376179864e-06, "loss": 4.3369, "step": 173 }, { "epoch": 0.13754940711462452, "grad_norm": 2.3605518341064453, "learning_rate": 2.990452604620181e-06, "loss": 4.0324, "step": 174 }, { "epoch": 0.1383399209486166, "grad_norm": 2.2147786617279053, "learning_rate": 2.9903107908978583e-06, "loss": 3.88, "step": 175 }, { "epoch": 0.1391304347826087, "grad_norm": 2.455955743789673, "learning_rate": 2.9901679351120557e-06, "loss": 4.0324, "step": 176 }, { "epoch": 0.1399209486166008, "grad_norm": 2.171581506729126, "learning_rate": 2.990024037362661e-06, "loss": 4.0965, "step": 177 }, { "epoch": 0.14071146245059288, "grad_norm": 2.088515281677246, "learning_rate": 2.989879097750292e-06, "loss": 4.0401, "step": 178 }, { "epoch": 0.14150197628458497, "grad_norm": 2.1429121494293213, "learning_rate": 2.989733116376293e-06, "loss": 4.1999, "step": 179 }, { "epoch": 0.1422924901185771, "grad_norm": 2.2390408515930176, "learning_rate": 2.989586093342739e-06, "loss": 3.8794, "step": 180 }, { "epoch": 0.14308300395256918, "grad_norm": 2.141752243041992, "learning_rate": 2.9894380287524325e-06, "loss": 3.9645, "step": 181 }, { "epoch": 0.14387351778656127, "grad_norm": 2.386245012283325, "learning_rate": 2.9892889227089024e-06, "loss": 3.9597, "step": 182 }, { "epoch": 0.14466403162055336, "grad_norm": 2.0590031147003174, "learning_rate": 2.989138775316409e-06, "loss": 4.2575, "step": 183 }, { "epoch": 0.14545454545454545, "grad_norm": 2.223085880279541, "learning_rate": 2.988987586679939e-06, "loss": 4.0669, "step": 184 }, { "epoch": 0.14624505928853754, "grad_norm": 2.759331464767456, "learning_rate": 2.9888353569052068e-06, "loss": 3.8844, "step": 185 }, { "epoch": 0.14703557312252966, "grad_norm": 2.7058396339416504, "learning_rate": 2.9886820860986555e-06, "loss": 3.8974, "step": 186 }, { "epoch": 0.14782608695652175, "grad_norm": 2.695584774017334, "learning_rate": 2.9885277743674565e-06, "loss": 4.1556, "step": 187 }, { "epoch": 0.14861660079051384, "grad_norm": 1.9609112739562988, "learning_rate": 2.9883724218195085e-06, "loss": 4.3026, "step": 188 }, { "epoch": 0.14940711462450593, "grad_norm": 2.0488033294677734, "learning_rate": 2.988216028563437e-06, "loss": 4.1503, "step": 189 }, { "epoch": 0.15019762845849802, "grad_norm": 2.360600233078003, "learning_rate": 2.988058594708597e-06, "loss": 3.8002, "step": 190 }, { "epoch": 0.1509881422924901, "grad_norm": 2.1316990852355957, "learning_rate": 2.98790012036507e-06, "loss": 4.1067, "step": 191 }, { "epoch": 0.15177865612648223, "grad_norm": 2.1589789390563965, "learning_rate": 2.9877406056436654e-06, "loss": 4.2313, "step": 192 }, { "epoch": 0.15256916996047432, "grad_norm": 2.4784228801727295, "learning_rate": 2.9875800506559205e-06, "loss": 3.8985, "step": 193 }, { "epoch": 0.1533596837944664, "grad_norm": 2.183457374572754, "learning_rate": 2.9874184555140983e-06, "loss": 3.8765, "step": 194 }, { "epoch": 0.1541501976284585, "grad_norm": 2.5136313438415527, "learning_rate": 2.9872558203311916e-06, "loss": 4.3545, "step": 195 }, { "epoch": 0.1549407114624506, "grad_norm": 2.1969239711761475, "learning_rate": 2.9870921452209176e-06, "loss": 4.0278, "step": 196 }, { "epoch": 0.15573122529644268, "grad_norm": 2.2775816917419434, "learning_rate": 2.9869274302977235e-06, "loss": 3.7748, "step": 197 }, { "epoch": 0.1565217391304348, "grad_norm": 2.2763266563415527, "learning_rate": 2.9867616756767814e-06, "loss": 3.6021, "step": 198 }, { "epoch": 0.15731225296442689, "grad_norm": 2.3503193855285645, "learning_rate": 2.9865948814739915e-06, "loss": 3.84, "step": 199 }, { "epoch": 0.15810276679841898, "grad_norm": 2.4972989559173584, "learning_rate": 2.9864270478059805e-06, "loss": 4.3005, "step": 200 }, { "epoch": 0.15889328063241107, "grad_norm": 2.2321107387542725, "learning_rate": 2.986258174790102e-06, "loss": 4.1831, "step": 201 }, { "epoch": 0.15968379446640316, "grad_norm": 2.446702718734741, "learning_rate": 2.986088262544436e-06, "loss": 3.8554, "step": 202 }, { "epoch": 0.16047430830039525, "grad_norm": 2.2169532775878906, "learning_rate": 2.98591731118779e-06, "loss": 3.9245, "step": 203 }, { "epoch": 0.16126482213438736, "grad_norm": 2.1441996097564697, "learning_rate": 2.9857453208396974e-06, "loss": 3.9915, "step": 204 }, { "epoch": 0.16205533596837945, "grad_norm": 2.0859811305999756, "learning_rate": 2.9855722916204184e-06, "loss": 4.0543, "step": 205 }, { "epoch": 0.16284584980237155, "grad_norm": 2.4596669673919678, "learning_rate": 2.985398223650939e-06, "loss": 3.8685, "step": 206 }, { "epoch": 0.16363636363636364, "grad_norm": 2.2530593872070312, "learning_rate": 2.9852231170529727e-06, "loss": 3.9494, "step": 207 }, { "epoch": 0.16442687747035573, "grad_norm": 2.0253303050994873, "learning_rate": 2.9850469719489573e-06, "loss": 4.081, "step": 208 }, { "epoch": 0.16521739130434782, "grad_norm": 2.188600540161133, "learning_rate": 2.9848697884620595e-06, "loss": 4.2381, "step": 209 }, { "epoch": 0.16600790513833993, "grad_norm": 2.552016258239746, "learning_rate": 2.98469156671617e-06, "loss": 3.9124, "step": 210 }, { "epoch": 0.16679841897233202, "grad_norm": 2.2695157527923584, "learning_rate": 2.984512306835905e-06, "loss": 4.0974, "step": 211 }, { "epoch": 0.16758893280632411, "grad_norm": 2.1780593395233154, "learning_rate": 2.984332008946609e-06, "loss": 4.0712, "step": 212 }, { "epoch": 0.1683794466403162, "grad_norm": 2.267279863357544, "learning_rate": 2.98415067317435e-06, "loss": 3.7144, "step": 213 }, { "epoch": 0.1691699604743083, "grad_norm": 2.0123801231384277, "learning_rate": 2.9839682996459236e-06, "loss": 4.3308, "step": 214 }, { "epoch": 0.16996047430830039, "grad_norm": 2.3712363243103027, "learning_rate": 2.9837848884888485e-06, "loss": 3.9279, "step": 215 }, { "epoch": 0.1707509881422925, "grad_norm": 2.282470703125, "learning_rate": 2.983600439831372e-06, "loss": 4.0709, "step": 216 }, { "epoch": 0.1715415019762846, "grad_norm": 2.425766706466675, "learning_rate": 2.9834149538024642e-06, "loss": 3.8073, "step": 217 }, { "epoch": 0.17233201581027668, "grad_norm": 2.672893762588501, "learning_rate": 2.983228430531822e-06, "loss": 4.1085, "step": 218 }, { "epoch": 0.17312252964426877, "grad_norm": 2.1799988746643066, "learning_rate": 2.983040870149868e-06, "loss": 3.5691, "step": 219 }, { "epoch": 0.17391304347826086, "grad_norm": 2.2356717586517334, "learning_rate": 2.982852272787748e-06, "loss": 3.9545, "step": 220 }, { "epoch": 0.17470355731225296, "grad_norm": 2.001188039779663, "learning_rate": 2.9826626385773356e-06, "loss": 4.3512, "step": 221 }, { "epoch": 0.17549407114624507, "grad_norm": 2.1264286041259766, "learning_rate": 2.9824719676512263e-06, "loss": 3.9833, "step": 222 }, { "epoch": 0.17628458498023716, "grad_norm": 2.0984225273132324, "learning_rate": 2.982280260142743e-06, "loss": 4.2794, "step": 223 }, { "epoch": 0.17707509881422925, "grad_norm": 1.8731462955474854, "learning_rate": 2.982087516185933e-06, "loss": 4.409, "step": 224 }, { "epoch": 0.17786561264822134, "grad_norm": 2.180933713912964, "learning_rate": 2.9818937359155664e-06, "loss": 4.2965, "step": 225 }, { "epoch": 0.17865612648221343, "grad_norm": 2.0333828926086426, "learning_rate": 2.98169891946714e-06, "loss": 4.2645, "step": 226 }, { "epoch": 0.17944664031620552, "grad_norm": 3.676851749420166, "learning_rate": 2.981503066976875e-06, "loss": 4.0348, "step": 227 }, { "epoch": 0.18023715415019761, "grad_norm": 2.2454257011413574, "learning_rate": 2.981306178581716e-06, "loss": 4.0223, "step": 228 }, { "epoch": 0.18102766798418973, "grad_norm": 3.3748247623443604, "learning_rate": 2.9811082544193324e-06, "loss": 3.7715, "step": 229 }, { "epoch": 0.18181818181818182, "grad_norm": 2.0113582611083984, "learning_rate": 2.9809092946281174e-06, "loss": 4.2319, "step": 230 }, { "epoch": 0.1826086956521739, "grad_norm": 2.3802096843719482, "learning_rate": 2.98070929934719e-06, "loss": 3.6478, "step": 231 }, { "epoch": 0.183399209486166, "grad_norm": 2.189199686050415, "learning_rate": 2.9805082687163904e-06, "loss": 3.8241, "step": 232 }, { "epoch": 0.1841897233201581, "grad_norm": 2.204874038696289, "learning_rate": 2.9803062028762856e-06, "loss": 3.9529, "step": 233 }, { "epoch": 0.18498023715415018, "grad_norm": 2.287152051925659, "learning_rate": 2.9801031019681648e-06, "loss": 3.8721, "step": 234 }, { "epoch": 0.1857707509881423, "grad_norm": 1.994430422782898, "learning_rate": 2.9798989661340416e-06, "loss": 4.2315, "step": 235 }, { "epoch": 0.1865612648221344, "grad_norm": 2.852356433868408, "learning_rate": 2.9796937955166525e-06, "loss": 3.686, "step": 236 }, { "epoch": 0.18735177865612648, "grad_norm": 2.915294647216797, "learning_rate": 2.9794875902594586e-06, "loss": 4.1365, "step": 237 }, { "epoch": 0.18814229249011857, "grad_norm": 2.355799913406372, "learning_rate": 2.9792803505066435e-06, "loss": 3.8014, "step": 238 }, { "epoch": 0.18893280632411066, "grad_norm": 2.099398136138916, "learning_rate": 2.9790720764031148e-06, "loss": 4.0193, "step": 239 }, { "epoch": 0.18972332015810275, "grad_norm": 3.516871690750122, "learning_rate": 2.978862768094503e-06, "loss": 3.8808, "step": 240 }, { "epoch": 0.19051383399209487, "grad_norm": 2.1675100326538086, "learning_rate": 2.9786524257271618e-06, "loss": 4.1971, "step": 241 }, { "epoch": 0.19130434782608696, "grad_norm": 2.693603277206421, "learning_rate": 2.978441049448168e-06, "loss": 3.9965, "step": 242 }, { "epoch": 0.19209486166007905, "grad_norm": 2.1408135890960693, "learning_rate": 2.9782286394053213e-06, "loss": 4.0018, "step": 243 }, { "epoch": 0.19288537549407114, "grad_norm": 2.4696953296661377, "learning_rate": 2.9780151957471443e-06, "loss": 3.7655, "step": 244 }, { "epoch": 0.19367588932806323, "grad_norm": 2.256025552749634, "learning_rate": 2.9778007186228826e-06, "loss": 3.8602, "step": 245 }, { "epoch": 0.19446640316205532, "grad_norm": 2.194545269012451, "learning_rate": 2.977585208182503e-06, "loss": 4.1283, "step": 246 }, { "epoch": 0.19525691699604744, "grad_norm": 2.4966018199920654, "learning_rate": 2.9773686645766964e-06, "loss": 3.8343, "step": 247 }, { "epoch": 0.19604743083003953, "grad_norm": 2.1650993824005127, "learning_rate": 2.9771510879568768e-06, "loss": 4.2552, "step": 248 }, { "epoch": 0.19683794466403162, "grad_norm": 1.8830687999725342, "learning_rate": 2.976932478475178e-06, "loss": 4.3821, "step": 249 }, { "epoch": 0.1976284584980237, "grad_norm": 2.616912603378296, "learning_rate": 2.976712836284457e-06, "loss": 3.9255, "step": 250 }, { "epoch": 0.1984189723320158, "grad_norm": 2.0517795085906982, "learning_rate": 2.9764921615382954e-06, "loss": 4.2615, "step": 251 }, { "epoch": 0.1992094861660079, "grad_norm": 2.071822166442871, "learning_rate": 2.9762704543909925e-06, "loss": 3.9593, "step": 252 }, { "epoch": 0.2, "grad_norm": 2.1346237659454346, "learning_rate": 2.9760477149975723e-06, "loss": 3.9149, "step": 253 }, { "epoch": 0.2007905138339921, "grad_norm": 2.464056968688965, "learning_rate": 2.97582394351378e-06, "loss": 4.1984, "step": 254 }, { "epoch": 0.2015810276679842, "grad_norm": 2.0870628356933594, "learning_rate": 2.9755991400960825e-06, "loss": 4.0865, "step": 255 }, { "epoch": 0.20237154150197628, "grad_norm": 2.2659802436828613, "learning_rate": 2.975373304901668e-06, "loss": 4.0347, "step": 256 }, { "epoch": 0.20316205533596837, "grad_norm": 2.1779799461364746, "learning_rate": 2.975146438088446e-06, "loss": 4.1298, "step": 257 }, { "epoch": 0.20395256916996046, "grad_norm": 2.4848618507385254, "learning_rate": 2.974918539815048e-06, "loss": 3.6173, "step": 258 }, { "epoch": 0.20474308300395258, "grad_norm": 2.0686378479003906, "learning_rate": 2.9746896102408257e-06, "loss": 4.1302, "step": 259 }, { "epoch": 0.20553359683794467, "grad_norm": 1.9876476526260376, "learning_rate": 2.974459649525853e-06, "loss": 4.2731, "step": 260 }, { "epoch": 0.20632411067193676, "grad_norm": 2.245143413543701, "learning_rate": 2.974228657830924e-06, "loss": 3.9929, "step": 261 }, { "epoch": 0.20711462450592885, "grad_norm": 2.024529218673706, "learning_rate": 2.9739966353175543e-06, "loss": 4.2612, "step": 262 }, { "epoch": 0.20790513833992094, "grad_norm": 2.1341793537139893, "learning_rate": 2.97376358214798e-06, "loss": 4.0901, "step": 263 }, { "epoch": 0.20869565217391303, "grad_norm": 2.1550707817077637, "learning_rate": 2.973529498485158e-06, "loss": 4.2586, "step": 264 }, { "epoch": 0.20948616600790515, "grad_norm": 2.0415382385253906, "learning_rate": 2.973294384492765e-06, "loss": 4.1613, "step": 265 }, { "epoch": 0.21027667984189724, "grad_norm": 2.2207934856414795, "learning_rate": 2.9730582403351994e-06, "loss": 4.0422, "step": 266 }, { "epoch": 0.21106719367588933, "grad_norm": 2.145447254180908, "learning_rate": 2.9728210661775785e-06, "loss": 4.2269, "step": 267 }, { "epoch": 0.21185770750988142, "grad_norm": 2.5428085327148438, "learning_rate": 2.972582862185741e-06, "loss": 3.8355, "step": 268 }, { "epoch": 0.2126482213438735, "grad_norm": 2.3855032920837402, "learning_rate": 2.9723436285262457e-06, "loss": 3.6729, "step": 269 }, { "epoch": 0.2134387351778656, "grad_norm": 2.4202897548675537, "learning_rate": 2.97210336536637e-06, "loss": 3.7442, "step": 270 }, { "epoch": 0.21422924901185772, "grad_norm": 2.3004064559936523, "learning_rate": 2.971862072874112e-06, "loss": 3.4012, "step": 271 }, { "epoch": 0.2150197628458498, "grad_norm": 2.6318490505218506, "learning_rate": 2.971619751218191e-06, "loss": 4.1168, "step": 272 }, { "epoch": 0.2158102766798419, "grad_norm": 2.8037307262420654, "learning_rate": 2.971376400568043e-06, "loss": 3.7135, "step": 273 }, { "epoch": 0.216600790513834, "grad_norm": 2.0793139934539795, "learning_rate": 2.9711320210938257e-06, "loss": 4.0505, "step": 274 }, { "epoch": 0.21739130434782608, "grad_norm": 2.6433265209198, "learning_rate": 2.970886612966415e-06, "loss": 3.5403, "step": 275 }, { "epoch": 0.21818181818181817, "grad_norm": 2.3466005325317383, "learning_rate": 2.9706401763574077e-06, "loss": 4.2094, "step": 276 }, { "epoch": 0.2189723320158103, "grad_norm": 2.238758087158203, "learning_rate": 2.970392711439117e-06, "loss": 3.976, "step": 277 }, { "epoch": 0.21976284584980238, "grad_norm": 2.0190906524658203, "learning_rate": 2.9701442183845778e-06, "loss": 4.1802, "step": 278 }, { "epoch": 0.22055335968379447, "grad_norm": 2.8885538578033447, "learning_rate": 2.9698946973675423e-06, "loss": 4.0008, "step": 279 }, { "epoch": 0.22134387351778656, "grad_norm": 2.2258694171905518, "learning_rate": 2.9696441485624824e-06, "loss": 4.0869, "step": 280 }, { "epoch": 0.22213438735177865, "grad_norm": 2.359792947769165, "learning_rate": 2.969392572144588e-06, "loss": 3.872, "step": 281 }, { "epoch": 0.22292490118577074, "grad_norm": 2.6408731937408447, "learning_rate": 2.969139968289768e-06, "loss": 3.7202, "step": 282 }, { "epoch": 0.22371541501976286, "grad_norm": 1.9893240928649902, "learning_rate": 2.9688863371746493e-06, "loss": 4.1534, "step": 283 }, { "epoch": 0.22450592885375495, "grad_norm": 2.288132429122925, "learning_rate": 2.9686316789765767e-06, "loss": 4.3131, "step": 284 }, { "epoch": 0.22529644268774704, "grad_norm": 2.1322882175445557, "learning_rate": 2.968375993873615e-06, "loss": 4.0031, "step": 285 }, { "epoch": 0.22608695652173913, "grad_norm": 2.197998285293579, "learning_rate": 2.9681192820445445e-06, "loss": 3.9447, "step": 286 }, { "epoch": 0.22687747035573122, "grad_norm": 2.1885430812835693, "learning_rate": 2.9678615436688654e-06, "loss": 3.8249, "step": 287 }, { "epoch": 0.2276679841897233, "grad_norm": 2.220344066619873, "learning_rate": 2.967602778926795e-06, "loss": 3.8246, "step": 288 }, { "epoch": 0.22845849802371543, "grad_norm": 2.109727382659912, "learning_rate": 2.9673429879992678e-06, "loss": 4.2768, "step": 289 }, { "epoch": 0.22924901185770752, "grad_norm": 2.295192241668701, "learning_rate": 2.967082171067937e-06, "loss": 4.1556, "step": 290 }, { "epoch": 0.2300395256916996, "grad_norm": 2.3440115451812744, "learning_rate": 2.966820328315172e-06, "loss": 3.8714, "step": 291 }, { "epoch": 0.2308300395256917, "grad_norm": 2.108649969100952, "learning_rate": 2.96655745992406e-06, "loss": 4.1446, "step": 292 }, { "epoch": 0.2316205533596838, "grad_norm": 2.1925888061523438, "learning_rate": 2.966293566078405e-06, "loss": 3.8436, "step": 293 }, { "epoch": 0.23241106719367588, "grad_norm": 2.1752119064331055, "learning_rate": 2.9660286469627283e-06, "loss": 3.9212, "step": 294 }, { "epoch": 0.233201581027668, "grad_norm": 2.265256404876709, "learning_rate": 2.965762702762269e-06, "loss": 4.0376, "step": 295 }, { "epoch": 0.2339920948616601, "grad_norm": 2.4677388668060303, "learning_rate": 2.9654957336629815e-06, "loss": 3.6308, "step": 296 }, { "epoch": 0.23478260869565218, "grad_norm": 2.2598471641540527, "learning_rate": 2.965227739851538e-06, "loss": 3.9324, "step": 297 }, { "epoch": 0.23557312252964427, "grad_norm": 2.171093463897705, "learning_rate": 2.964958721515325e-06, "loss": 4.3492, "step": 298 }, { "epoch": 0.23636363636363636, "grad_norm": 2.1521401405334473, "learning_rate": 2.9646886788424488e-06, "loss": 4.2055, "step": 299 }, { "epoch": 0.23715415019762845, "grad_norm": 2.142150402069092, "learning_rate": 2.9644176120217297e-06, "loss": 4.2873, "step": 300 }, { "epoch": 0.23794466403162054, "grad_norm": 2.1724867820739746, "learning_rate": 2.964145521242704e-06, "loss": 4.1919, "step": 301 }, { "epoch": 0.23873517786561266, "grad_norm": 2.436922550201416, "learning_rate": 2.963872406695625e-06, "loss": 4.153, "step": 302 }, { "epoch": 0.23952569169960475, "grad_norm": 2.209285020828247, "learning_rate": 2.9635982685714613e-06, "loss": 4.0487, "step": 303 }, { "epoch": 0.24031620553359684, "grad_norm": 3.2151830196380615, "learning_rate": 2.9633231070618975e-06, "loss": 4.2059, "step": 304 }, { "epoch": 0.24110671936758893, "grad_norm": 2.114908456802368, "learning_rate": 2.963046922359333e-06, "loss": 4.2293, "step": 305 }, { "epoch": 0.24189723320158102, "grad_norm": 2.2314412593841553, "learning_rate": 2.9627697146568837e-06, "loss": 4.226, "step": 306 }, { "epoch": 0.2426877470355731, "grad_norm": 2.428211212158203, "learning_rate": 2.9624914841483803e-06, "loss": 3.8642, "step": 307 }, { "epoch": 0.24347826086956523, "grad_norm": 2.444284677505493, "learning_rate": 2.962212231028368e-06, "loss": 3.6972, "step": 308 }, { "epoch": 0.24426877470355732, "grad_norm": 2.123199224472046, "learning_rate": 2.9619319554921096e-06, "loss": 3.921, "step": 309 }, { "epoch": 0.2450592885375494, "grad_norm": 2.393693447113037, "learning_rate": 2.9616506577355787e-06, "loss": 4.2076, "step": 310 }, { "epoch": 0.2458498023715415, "grad_norm": 2.1980440616607666, "learning_rate": 2.9613683379554675e-06, "loss": 3.8318, "step": 311 }, { "epoch": 0.2466403162055336, "grad_norm": 2.172250747680664, "learning_rate": 2.96108499634918e-06, "loss": 4.0949, "step": 312 }, { "epoch": 0.24743083003952568, "grad_norm": 2.0236387252807617, "learning_rate": 2.960800633114837e-06, "loss": 4.0672, "step": 313 }, { "epoch": 0.2482213438735178, "grad_norm": 1.9811339378356934, "learning_rate": 2.9605152484512717e-06, "loss": 4.3004, "step": 314 }, { "epoch": 0.2490118577075099, "grad_norm": 2.2722394466400146, "learning_rate": 2.960228842558033e-06, "loss": 4.5285, "step": 315 }, { "epoch": 0.24980237154150198, "grad_norm": 2.080345630645752, "learning_rate": 2.959941415635383e-06, "loss": 4.1775, "step": 316 }, { "epoch": 0.25059288537549407, "grad_norm": 2.2167232036590576, "learning_rate": 2.9596529678842977e-06, "loss": 3.9275, "step": 317 }, { "epoch": 0.25138339920948616, "grad_norm": 2.0418310165405273, "learning_rate": 2.9593634995064675e-06, "loss": 4.0317, "step": 318 }, { "epoch": 0.25217391304347825, "grad_norm": 2.4662516117095947, "learning_rate": 2.959073010704296e-06, "loss": 3.8912, "step": 319 }, { "epoch": 0.25296442687747034, "grad_norm": 2.6843996047973633, "learning_rate": 2.9587815016809002e-06, "loss": 3.9472, "step": 320 }, { "epoch": 0.25375494071146243, "grad_norm": 2.19563627243042, "learning_rate": 2.9584889726401116e-06, "loss": 3.9328, "step": 321 }, { "epoch": 0.2545454545454545, "grad_norm": 2.3735034465789795, "learning_rate": 2.9581954237864726e-06, "loss": 3.9161, "step": 322 }, { "epoch": 0.25533596837944667, "grad_norm": 2.166224479675293, "learning_rate": 2.9579008553252407e-06, "loss": 4.1082, "step": 323 }, { "epoch": 0.25612648221343876, "grad_norm": 2.2177257537841797, "learning_rate": 2.9576052674623864e-06, "loss": 4.2125, "step": 324 }, { "epoch": 0.25691699604743085, "grad_norm": 2.14560866355896, "learning_rate": 2.957308660404591e-06, "loss": 4.0285, "step": 325 }, { "epoch": 0.25770750988142294, "grad_norm": 2.325580358505249, "learning_rate": 2.9570110343592506e-06, "loss": 4.0015, "step": 326 }, { "epoch": 0.258498023715415, "grad_norm": 2.14801287651062, "learning_rate": 2.956712389534473e-06, "loss": 3.9, "step": 327 }, { "epoch": 0.2592885375494071, "grad_norm": 3.63712215423584, "learning_rate": 2.956412726139078e-06, "loss": 3.7463, "step": 328 }, { "epoch": 0.2600790513833992, "grad_norm": 2.0753073692321777, "learning_rate": 2.956112044382598e-06, "loss": 4.2198, "step": 329 }, { "epoch": 0.2608695652173913, "grad_norm": 2.1820595264434814, "learning_rate": 2.9558103444752778e-06, "loss": 3.9346, "step": 330 }, { "epoch": 0.2616600790513834, "grad_norm": 2.2329859733581543, "learning_rate": 2.9555076266280726e-06, "loss": 3.8019, "step": 331 }, { "epoch": 0.2624505928853755, "grad_norm": 2.324065923690796, "learning_rate": 2.955203891052652e-06, "loss": 4.0336, "step": 332 }, { "epoch": 0.26324110671936757, "grad_norm": 2.2265467643737793, "learning_rate": 2.9548991379613943e-06, "loss": 3.9771, "step": 333 }, { "epoch": 0.26403162055335966, "grad_norm": 2.1999294757843018, "learning_rate": 2.954593367567391e-06, "loss": 4.2068, "step": 334 }, { "epoch": 0.2648221343873518, "grad_norm": 2.94905161857605, "learning_rate": 2.9542865800844456e-06, "loss": 4.0974, "step": 335 }, { "epoch": 0.2656126482213439, "grad_norm": 2.3193211555480957, "learning_rate": 2.953978775727071e-06, "loss": 3.8902, "step": 336 }, { "epoch": 0.266403162055336, "grad_norm": 2.2629597187042236, "learning_rate": 2.953669954710491e-06, "loss": 4.2598, "step": 337 }, { "epoch": 0.2671936758893281, "grad_norm": 2.2310373783111572, "learning_rate": 2.953360117250643e-06, "loss": 4.2399, "step": 338 }, { "epoch": 0.26798418972332017, "grad_norm": 2.1418302059173584, "learning_rate": 2.953049263564172e-06, "loss": 4.2144, "step": 339 }, { "epoch": 0.26877470355731226, "grad_norm": 2.395503044128418, "learning_rate": 2.952737393868435e-06, "loss": 4.1212, "step": 340 }, { "epoch": 0.26956521739130435, "grad_norm": 2.9781579971313477, "learning_rate": 2.9524245083814997e-06, "loss": 3.7647, "step": 341 }, { "epoch": 0.27035573122529644, "grad_norm": 2.126030683517456, "learning_rate": 2.9521106073221433e-06, "loss": 4.1854, "step": 342 }, { "epoch": 0.2711462450592885, "grad_norm": 2.4289512634277344, "learning_rate": 2.9517956909098537e-06, "loss": 3.7648, "step": 343 }, { "epoch": 0.2719367588932806, "grad_norm": 2.3404486179351807, "learning_rate": 2.951479759364828e-06, "loss": 4.2805, "step": 344 }, { "epoch": 0.2727272727272727, "grad_norm": 2.236771583557129, "learning_rate": 2.9511628129079753e-06, "loss": 4.1013, "step": 345 }, { "epoch": 0.2735177865612648, "grad_norm": 2.199070453643799, "learning_rate": 2.95084485176091e-06, "loss": 3.9571, "step": 346 }, { "epoch": 0.27430830039525694, "grad_norm": 2.451734781265259, "learning_rate": 2.950525876145961e-06, "loss": 3.6545, "step": 347 }, { "epoch": 0.27509881422924903, "grad_norm": 2.626018762588501, "learning_rate": 2.950205886286163e-06, "loss": 4.0504, "step": 348 }, { "epoch": 0.2758893280632411, "grad_norm": 2.6828837394714355, "learning_rate": 2.9498848824052616e-06, "loss": 3.8897, "step": 349 }, { "epoch": 0.2766798418972332, "grad_norm": 3.183534622192383, "learning_rate": 2.9495628647277102e-06, "loss": 4.0347, "step": 350 }, { "epoch": 0.2774703557312253, "grad_norm": 3.057873010635376, "learning_rate": 2.949239833478673e-06, "loss": 3.9103, "step": 351 }, { "epoch": 0.2782608695652174, "grad_norm": 2.2991318702697754, "learning_rate": 2.948915788884021e-06, "loss": 4.1954, "step": 352 }, { "epoch": 0.2790513833992095, "grad_norm": 2.3294565677642822, "learning_rate": 2.9485907311703345e-06, "loss": 3.8247, "step": 353 }, { "epoch": 0.2798418972332016, "grad_norm": 2.180840015411377, "learning_rate": 2.9482646605649027e-06, "loss": 3.9983, "step": 354 }, { "epoch": 0.28063241106719367, "grad_norm": 2.166125774383545, "learning_rate": 2.9479375772957217e-06, "loss": 4.1403, "step": 355 }, { "epoch": 0.28142292490118576, "grad_norm": 3.0945699214935303, "learning_rate": 2.9476094815914966e-06, "loss": 4.167, "step": 356 }, { "epoch": 0.28221343873517785, "grad_norm": 2.3086230754852295, "learning_rate": 2.9472803736816413e-06, "loss": 4.079, "step": 357 }, { "epoch": 0.28300395256916994, "grad_norm": 2.5135653018951416, "learning_rate": 2.946950253796275e-06, "loss": 4.1054, "step": 358 }, { "epoch": 0.2837944664031621, "grad_norm": 2.1529886722564697, "learning_rate": 2.9466191221662277e-06, "loss": 3.9313, "step": 359 }, { "epoch": 0.2845849802371542, "grad_norm": 2.4047141075134277, "learning_rate": 2.9462869790230333e-06, "loss": 3.8398, "step": 360 }, { "epoch": 0.28537549407114626, "grad_norm": 2.1676783561706543, "learning_rate": 2.945953824598936e-06, "loss": 4.0973, "step": 361 }, { "epoch": 0.28616600790513835, "grad_norm": 2.663418769836426, "learning_rate": 2.9456196591268856e-06, "loss": 3.6732, "step": 362 }, { "epoch": 0.28695652173913044, "grad_norm": 2.0317816734313965, "learning_rate": 2.9452844828405394e-06, "loss": 4.0811, "step": 363 }, { "epoch": 0.28774703557312253, "grad_norm": 2.047367572784424, "learning_rate": 2.944948295974261e-06, "loss": 4.2356, "step": 364 }, { "epoch": 0.2885375494071146, "grad_norm": 2.3424441814422607, "learning_rate": 2.9446110987631207e-06, "loss": 3.6345, "step": 365 }, { "epoch": 0.2893280632411067, "grad_norm": 2.3934285640716553, "learning_rate": 2.9442728914428953e-06, "loss": 4.1244, "step": 366 }, { "epoch": 0.2901185770750988, "grad_norm": 2.309551954269409, "learning_rate": 2.9439336742500685e-06, "loss": 3.9674, "step": 367 }, { "epoch": 0.2909090909090909, "grad_norm": 2.232842445373535, "learning_rate": 2.9435934474218297e-06, "loss": 3.7829, "step": 368 }, { "epoch": 0.291699604743083, "grad_norm": 2.4629220962524414, "learning_rate": 2.943252211196074e-06, "loss": 4.0659, "step": 369 }, { "epoch": 0.2924901185770751, "grad_norm": 2.7686119079589844, "learning_rate": 2.9429099658114024e-06, "loss": 4.143, "step": 370 }, { "epoch": 0.2932806324110672, "grad_norm": 2.8743598461151123, "learning_rate": 2.9425667115071216e-06, "loss": 4.3775, "step": 371 }, { "epoch": 0.2940711462450593, "grad_norm": 2.390409469604492, "learning_rate": 2.9422224485232437e-06, "loss": 4.0211, "step": 372 }, { "epoch": 0.2948616600790514, "grad_norm": 1.9424328804016113, "learning_rate": 2.9418771771004868e-06, "loss": 4.3599, "step": 373 }, { "epoch": 0.2956521739130435, "grad_norm": 3.2294180393218994, "learning_rate": 2.9415308974802726e-06, "loss": 3.8704, "step": 374 }, { "epoch": 0.2964426877470356, "grad_norm": 2.0203115940093994, "learning_rate": 2.9411836099047303e-06, "loss": 4.3346, "step": 375 }, { "epoch": 0.2972332015810277, "grad_norm": 2.2469444274902344, "learning_rate": 2.9408353146166902e-06, "loss": 3.8944, "step": 376 }, { "epoch": 0.29802371541501976, "grad_norm": 2.6451609134674072, "learning_rate": 2.9404860118596904e-06, "loss": 4.0239, "step": 377 }, { "epoch": 0.29881422924901185, "grad_norm": 2.5109405517578125, "learning_rate": 2.9401357018779728e-06, "loss": 4.0913, "step": 378 }, { "epoch": 0.29960474308300394, "grad_norm": 2.2692067623138428, "learning_rate": 2.9397843849164823e-06, "loss": 3.8418, "step": 379 }, { "epoch": 0.30039525691699603, "grad_norm": 2.441124439239502, "learning_rate": 2.9394320612208684e-06, "loss": 4.1416, "step": 380 }, { "epoch": 0.3011857707509881, "grad_norm": 2.3199639320373535, "learning_rate": 2.9390787310374863e-06, "loss": 3.8529, "step": 381 }, { "epoch": 0.3019762845849802, "grad_norm": 2.204930543899536, "learning_rate": 2.938724394613392e-06, "loss": 4.301, "step": 382 }, { "epoch": 0.3027667984189723, "grad_norm": 2.4493072032928467, "learning_rate": 2.938369052196347e-06, "loss": 4.248, "step": 383 }, { "epoch": 0.30355731225296445, "grad_norm": 2.7374508380889893, "learning_rate": 2.938012704034817e-06, "loss": 4.2853, "step": 384 }, { "epoch": 0.30434782608695654, "grad_norm": 2.9343221187591553, "learning_rate": 2.937655350377968e-06, "loss": 3.8135, "step": 385 }, { "epoch": 0.30513833992094863, "grad_norm": 2.1325559616088867, "learning_rate": 2.937296991475672e-06, "loss": 4.1011, "step": 386 }, { "epoch": 0.3059288537549407, "grad_norm": 2.148656129837036, "learning_rate": 2.936937627578502e-06, "loss": 4.2256, "step": 387 }, { "epoch": 0.3067193675889328, "grad_norm": 2.2726407051086426, "learning_rate": 2.936577258937735e-06, "loss": 3.8808, "step": 388 }, { "epoch": 0.3075098814229249, "grad_norm": 2.187255620956421, "learning_rate": 2.9362158858053495e-06, "loss": 4.0042, "step": 389 }, { "epoch": 0.308300395256917, "grad_norm": 2.0763821601867676, "learning_rate": 2.9358535084340278e-06, "loss": 4.1877, "step": 390 }, { "epoch": 0.3090909090909091, "grad_norm": 2.4884285926818848, "learning_rate": 2.9354901270771524e-06, "loss": 4.1366, "step": 391 }, { "epoch": 0.3098814229249012, "grad_norm": 2.465118646621704, "learning_rate": 2.935125741988809e-06, "loss": 3.8732, "step": 392 }, { "epoch": 0.31067193675889326, "grad_norm": 2.6948087215423584, "learning_rate": 2.9347603534237857e-06, "loss": 4.0674, "step": 393 }, { "epoch": 0.31146245059288535, "grad_norm": 2.4577503204345703, "learning_rate": 2.934393961637571e-06, "loss": 3.8522, "step": 394 }, { "epoch": 0.31225296442687744, "grad_norm": 2.2810049057006836, "learning_rate": 2.9340265668863554e-06, "loss": 3.8681, "step": 395 }, { "epoch": 0.3130434782608696, "grad_norm": 2.1061129570007324, "learning_rate": 2.9336581694270306e-06, "loss": 4.0885, "step": 396 }, { "epoch": 0.3138339920948617, "grad_norm": 2.1035377979278564, "learning_rate": 2.9332887695171896e-06, "loss": 4.1419, "step": 397 }, { "epoch": 0.31462450592885377, "grad_norm": 2.4450507164001465, "learning_rate": 2.932918367415127e-06, "loss": 4.0659, "step": 398 }, { "epoch": 0.31541501976284586, "grad_norm": 2.6997838020324707, "learning_rate": 2.9325469633798363e-06, "loss": 3.8938, "step": 399 }, { "epoch": 0.31620553359683795, "grad_norm": 1.9854880571365356, "learning_rate": 2.932174557671014e-06, "loss": 4.4503, "step": 400 }, { "epoch": 0.31699604743083004, "grad_norm": 3.717038869857788, "learning_rate": 2.931801150549054e-06, "loss": 3.954, "step": 401 }, { "epoch": 0.31778656126482213, "grad_norm": 2.119933605194092, "learning_rate": 2.931426742275054e-06, "loss": 3.9531, "step": 402 }, { "epoch": 0.3185770750988142, "grad_norm": 2.061952590942383, "learning_rate": 2.9310513331108086e-06, "loss": 3.8779, "step": 403 }, { "epoch": 0.3193675889328063, "grad_norm": 2.632378339767456, "learning_rate": 2.9306749233188142e-06, "loss": 4.0705, "step": 404 }, { "epoch": 0.3201581027667984, "grad_norm": 2.411524534225464, "learning_rate": 2.930297513162265e-06, "loss": 3.5213, "step": 405 }, { "epoch": 0.3209486166007905, "grad_norm": 2.0688092708587646, "learning_rate": 2.9299191029050574e-06, "loss": 4.1069, "step": 406 }, { "epoch": 0.3217391304347826, "grad_norm": 2.195094347000122, "learning_rate": 2.9295396928117854e-06, "loss": 4.0906, "step": 407 }, { "epoch": 0.32252964426877473, "grad_norm": 2.102722644805908, "learning_rate": 2.929159283147741e-06, "loss": 4.2583, "step": 408 }, { "epoch": 0.3233201581027668, "grad_norm": 2.0772087574005127, "learning_rate": 2.9287778741789165e-06, "loss": 4.1772, "step": 409 }, { "epoch": 0.3241106719367589, "grad_norm": 2.1864607334136963, "learning_rate": 2.928395466172004e-06, "loss": 4.2954, "step": 410 }, { "epoch": 0.324901185770751, "grad_norm": 2.0889270305633545, "learning_rate": 2.9280120593943925e-06, "loss": 4.156, "step": 411 }, { "epoch": 0.3256916996047431, "grad_norm": 2.072766065597534, "learning_rate": 2.9276276541141693e-06, "loss": 4.3246, "step": 412 }, { "epoch": 0.3264822134387352, "grad_norm": 2.333378791809082, "learning_rate": 2.9272422506001214e-06, "loss": 3.9594, "step": 413 }, { "epoch": 0.32727272727272727, "grad_norm": 2.195173978805542, "learning_rate": 2.926855849121732e-06, "loss": 4.1296, "step": 414 }, { "epoch": 0.32806324110671936, "grad_norm": 2.35520339012146, "learning_rate": 2.926468449949183e-06, "loss": 3.823, "step": 415 }, { "epoch": 0.32885375494071145, "grad_norm": 2.463986396789551, "learning_rate": 2.926080053353354e-06, "loss": 4.0314, "step": 416 }, { "epoch": 0.32964426877470354, "grad_norm": 2.1926181316375732, "learning_rate": 2.925690659605822e-06, "loss": 3.9473, "step": 417 }, { "epoch": 0.33043478260869563, "grad_norm": 2.5609772205352783, "learning_rate": 2.9253002689788604e-06, "loss": 4.0796, "step": 418 }, { "epoch": 0.3312252964426877, "grad_norm": 2.768385410308838, "learning_rate": 2.9249088817454416e-06, "loss": 3.8379, "step": 419 }, { "epoch": 0.33201581027667987, "grad_norm": 2.336761713027954, "learning_rate": 2.924516498179232e-06, "loss": 4.2201, "step": 420 }, { "epoch": 0.33280632411067196, "grad_norm": 2.2262954711914062, "learning_rate": 2.9241231185545977e-06, "loss": 3.9189, "step": 421 }, { "epoch": 0.33359683794466405, "grad_norm": 2.5179741382598877, "learning_rate": 2.9237287431465988e-06, "loss": 3.9803, "step": 422 }, { "epoch": 0.33438735177865614, "grad_norm": 2.122593641281128, "learning_rate": 2.923333372230993e-06, "loss": 4.2405, "step": 423 }, { "epoch": 0.33517786561264823, "grad_norm": 2.2024760246276855, "learning_rate": 2.9229370060842342e-06, "loss": 4.3206, "step": 424 }, { "epoch": 0.3359683794466403, "grad_norm": 2.1568984985351562, "learning_rate": 2.9225396449834715e-06, "loss": 4.1363, "step": 425 }, { "epoch": 0.3367588932806324, "grad_norm": 2.137089967727661, "learning_rate": 2.922141289206549e-06, "loss": 4.0702, "step": 426 }, { "epoch": 0.3375494071146245, "grad_norm": 2.156400203704834, "learning_rate": 2.921741939032009e-06, "loss": 3.9337, "step": 427 }, { "epoch": 0.3383399209486166, "grad_norm": 2.1902549266815186, "learning_rate": 2.921341594739086e-06, "loss": 4.1108, "step": 428 }, { "epoch": 0.3391304347826087, "grad_norm": 3.074528217315674, "learning_rate": 2.9209402566077116e-06, "loss": 3.4984, "step": 429 }, { "epoch": 0.33992094861660077, "grad_norm": 2.4137206077575684, "learning_rate": 2.9205379249185117e-06, "loss": 3.9871, "step": 430 }, { "epoch": 0.34071146245059286, "grad_norm": 2.216160297393799, "learning_rate": 2.9201345999528062e-06, "loss": 4.1161, "step": 431 }, { "epoch": 0.341501976284585, "grad_norm": 1.9848891496658325, "learning_rate": 2.919730281992612e-06, "loss": 4.1565, "step": 432 }, { "epoch": 0.3422924901185771, "grad_norm": 2.318152904510498, "learning_rate": 2.919324971320636e-06, "loss": 3.9315, "step": 433 }, { "epoch": 0.3430830039525692, "grad_norm": 2.092583656311035, "learning_rate": 2.918918668220284e-06, "loss": 4.0125, "step": 434 }, { "epoch": 0.3438735177865613, "grad_norm": 2.4105722904205322, "learning_rate": 2.918511372975653e-06, "loss": 3.9092, "step": 435 }, { "epoch": 0.34466403162055337, "grad_norm": 2.0704407691955566, "learning_rate": 2.918103085871534e-06, "loss": 4.1533, "step": 436 }, { "epoch": 0.34545454545454546, "grad_norm": 2.2294363975524902, "learning_rate": 2.917693807193411e-06, "loss": 3.9539, "step": 437 }, { "epoch": 0.34624505928853755, "grad_norm": 2.070502281188965, "learning_rate": 2.917283537227464e-06, "loss": 4.1527, "step": 438 }, { "epoch": 0.34703557312252964, "grad_norm": 2.179309606552124, "learning_rate": 2.9168722762605624e-06, "loss": 4.062, "step": 439 }, { "epoch": 0.34782608695652173, "grad_norm": 2.419806480407715, "learning_rate": 2.9164600245802716e-06, "loss": 3.5642, "step": 440 }, { "epoch": 0.3486166007905138, "grad_norm": 2.2101476192474365, "learning_rate": 2.916046782474848e-06, "loss": 4.0622, "step": 441 }, { "epoch": 0.3494071146245059, "grad_norm": 2.0172064304351807, "learning_rate": 2.9156325502332413e-06, "loss": 4.4362, "step": 442 }, { "epoch": 0.350197628458498, "grad_norm": 2.294893264770508, "learning_rate": 2.915217328145093e-06, "loss": 3.9718, "step": 443 }, { "epoch": 0.35098814229249015, "grad_norm": 2.1414718627929688, "learning_rate": 2.914801116500738e-06, "loss": 4.156, "step": 444 }, { "epoch": 0.35177865612648224, "grad_norm": 2.2105367183685303, "learning_rate": 2.914383915591201e-06, "loss": 4.01, "step": 445 }, { "epoch": 0.3525691699604743, "grad_norm": 2.217642068862915, "learning_rate": 2.9139657257081996e-06, "loss": 3.7965, "step": 446 }, { "epoch": 0.3533596837944664, "grad_norm": 2.3136138916015625, "learning_rate": 2.9135465471441435e-06, "loss": 4.0248, "step": 447 }, { "epoch": 0.3541501976284585, "grad_norm": 2.3293564319610596, "learning_rate": 2.9131263801921326e-06, "loss": 4.1393, "step": 448 }, { "epoch": 0.3549407114624506, "grad_norm": 1.9660755395889282, "learning_rate": 2.912705225145959e-06, "loss": 4.3857, "step": 449 }, { "epoch": 0.3557312252964427, "grad_norm": 2.1146445274353027, "learning_rate": 2.9122830823001045e-06, "loss": 4.1328, "step": 450 }, { "epoch": 0.3565217391304348, "grad_norm": 2.2180840969085693, "learning_rate": 2.9118599519497426e-06, "loss": 3.8163, "step": 451 }, { "epoch": 0.35731225296442687, "grad_norm": 1.9956990480422974, "learning_rate": 2.9114358343907363e-06, "loss": 4.4452, "step": 452 }, { "epoch": 0.35810276679841896, "grad_norm": 2.141120195388794, "learning_rate": 2.91101072991964e-06, "loss": 3.976, "step": 453 }, { "epoch": 0.35889328063241105, "grad_norm": 2.5643162727355957, "learning_rate": 2.9105846388336978e-06, "loss": 4.1803, "step": 454 }, { "epoch": 0.35968379446640314, "grad_norm": 2.065150022506714, "learning_rate": 2.9101575614308423e-06, "loss": 4.3223, "step": 455 }, { "epoch": 0.36047430830039523, "grad_norm": 2.2472190856933594, "learning_rate": 2.909729498009699e-06, "loss": 3.7165, "step": 456 }, { "epoch": 0.3612648221343874, "grad_norm": 2.259584903717041, "learning_rate": 2.909300448869579e-06, "loss": 3.8129, "step": 457 }, { "epoch": 0.36205533596837947, "grad_norm": 2.3634192943573, "learning_rate": 2.9088704143104853e-06, "loss": 3.8419, "step": 458 }, { "epoch": 0.36284584980237156, "grad_norm": 2.095984935760498, "learning_rate": 2.9084393946331084e-06, "loss": 4.2767, "step": 459 }, { "epoch": 0.36363636363636365, "grad_norm": 2.2537519931793213, "learning_rate": 2.908007390138829e-06, "loss": 3.8808, "step": 460 }, { "epoch": 0.36442687747035574, "grad_norm": 2.1148931980133057, "learning_rate": 2.907574401129716e-06, "loss": 4.2221, "step": 461 }, { "epoch": 0.3652173913043478, "grad_norm": 2.477668285369873, "learning_rate": 2.9071404279085253e-06, "loss": 4.1081, "step": 462 }, { "epoch": 0.3660079051383399, "grad_norm": 2.2015929222106934, "learning_rate": 2.906705470778703e-06, "loss": 3.6955, "step": 463 }, { "epoch": 0.366798418972332, "grad_norm": 2.797746419906616, "learning_rate": 2.906269530044382e-06, "loss": 4.1443, "step": 464 }, { "epoch": 0.3675889328063241, "grad_norm": 2.0167019367218018, "learning_rate": 2.9058326060103826e-06, "loss": 4.1757, "step": 465 }, { "epoch": 0.3683794466403162, "grad_norm": 2.0686490535736084, "learning_rate": 2.9053946989822145e-06, "loss": 4.1266, "step": 466 }, { "epoch": 0.3691699604743083, "grad_norm": 1.9584245681762695, "learning_rate": 2.9049558092660728e-06, "loss": 4.4474, "step": 467 }, { "epoch": 0.36996047430830037, "grad_norm": 1.9351991415023804, "learning_rate": 2.9045159371688403e-06, "loss": 4.5262, "step": 468 }, { "epoch": 0.3707509881422925, "grad_norm": 2.696094512939453, "learning_rate": 2.9040750829980868e-06, "loss": 3.8396, "step": 469 }, { "epoch": 0.3715415019762846, "grad_norm": 2.300950765609741, "learning_rate": 2.9036332470620693e-06, "loss": 3.9987, "step": 470 }, { "epoch": 0.3723320158102767, "grad_norm": 2.1039538383483887, "learning_rate": 2.903190429669731e-06, "loss": 4.1906, "step": 471 }, { "epoch": 0.3731225296442688, "grad_norm": 2.1419413089752197, "learning_rate": 2.9027466311307e-06, "loss": 4.2371, "step": 472 }, { "epoch": 0.3739130434782609, "grad_norm": 2.0779643058776855, "learning_rate": 2.902301851755293e-06, "loss": 4.1407, "step": 473 }, { "epoch": 0.37470355731225297, "grad_norm": 2.2326269149780273, "learning_rate": 2.90185609185451e-06, "loss": 3.9922, "step": 474 }, { "epoch": 0.37549407114624506, "grad_norm": 2.1206722259521484, "learning_rate": 2.9014093517400387e-06, "loss": 4.2148, "step": 475 }, { "epoch": 0.37628458498023715, "grad_norm": 2.223644733428955, "learning_rate": 2.9009616317242505e-06, "loss": 3.9281, "step": 476 }, { "epoch": 0.37707509881422924, "grad_norm": 2.6362595558166504, "learning_rate": 2.900512932120203e-06, "loss": 3.954, "step": 477 }, { "epoch": 0.3778656126482213, "grad_norm": 1.9515467882156372, "learning_rate": 2.900063253241638e-06, "loss": 4.3754, "step": 478 }, { "epoch": 0.3786561264822134, "grad_norm": 2.0047831535339355, "learning_rate": 2.8996125954029833e-06, "loss": 4.2346, "step": 479 }, { "epoch": 0.3794466403162055, "grad_norm": 2.131376266479492, "learning_rate": 2.8991609589193502e-06, "loss": 4.1598, "step": 480 }, { "epoch": 0.38023715415019765, "grad_norm": 1.989701747894287, "learning_rate": 2.8987083441065335e-06, "loss": 4.175, "step": 481 }, { "epoch": 0.38102766798418974, "grad_norm": 2.426793098449707, "learning_rate": 2.8982547512810148e-06, "loss": 4.3347, "step": 482 }, { "epoch": 0.38181818181818183, "grad_norm": 2.571873664855957, "learning_rate": 2.8978001807599565e-06, "loss": 4.1487, "step": 483 }, { "epoch": 0.3826086956521739, "grad_norm": 2.160423994064331, "learning_rate": 2.8973446328612062e-06, "loss": 3.8804, "step": 484 }, { "epoch": 0.383399209486166, "grad_norm": 2.2608261108398438, "learning_rate": 2.896888107903295e-06, "loss": 4.2287, "step": 485 }, { "epoch": 0.3841897233201581, "grad_norm": 2.1053433418273926, "learning_rate": 2.8964306062054366e-06, "loss": 3.999, "step": 486 }, { "epoch": 0.3849802371541502, "grad_norm": 2.152895212173462, "learning_rate": 2.8959721280875277e-06, "loss": 3.7443, "step": 487 }, { "epoch": 0.3857707509881423, "grad_norm": 2.2211251258850098, "learning_rate": 2.8955126738701487e-06, "loss": 3.7424, "step": 488 }, { "epoch": 0.3865612648221344, "grad_norm": 2.1291894912719727, "learning_rate": 2.8950522438745608e-06, "loss": 4.1102, "step": 489 }, { "epoch": 0.38735177865612647, "grad_norm": 2.508066415786743, "learning_rate": 2.894590838422709e-06, "loss": 4.2085, "step": 490 }, { "epoch": 0.38814229249011856, "grad_norm": 2.633690118789673, "learning_rate": 2.89412845783722e-06, "loss": 3.995, "step": 491 }, { "epoch": 0.38893280632411065, "grad_norm": 2.214195489883423, "learning_rate": 2.8936651024414018e-06, "loss": 4.2054, "step": 492 }, { "epoch": 0.3897233201581028, "grad_norm": 2.9051496982574463, "learning_rate": 2.8932007725592445e-06, "loss": 4.4206, "step": 493 }, { "epoch": 0.3905138339920949, "grad_norm": 2.125666379928589, "learning_rate": 2.892735468515419e-06, "loss": 4.1032, "step": 494 }, { "epoch": 0.391304347826087, "grad_norm": 3.1279778480529785, "learning_rate": 2.8922691906352786e-06, "loss": 3.5936, "step": 495 }, { "epoch": 0.39209486166007906, "grad_norm": 2.0520710945129395, "learning_rate": 2.891801939244856e-06, "loss": 4.2245, "step": 496 }, { "epoch": 0.39288537549407115, "grad_norm": 2.063340425491333, "learning_rate": 2.8913337146708656e-06, "loss": 4.1161, "step": 497 }, { "epoch": 0.39367588932806324, "grad_norm": 2.249211311340332, "learning_rate": 2.8908645172407023e-06, "loss": 4.0008, "step": 498 }, { "epoch": 0.39446640316205533, "grad_norm": 2.1570396423339844, "learning_rate": 2.890394347282441e-06, "loss": 4.1135, "step": 499 }, { "epoch": 0.3952569169960474, "grad_norm": 2.2053334712982178, "learning_rate": 2.889923205124836e-06, "loss": 3.824, "step": 500 }, { "epoch": 0.3960474308300395, "grad_norm": 2.4537079334259033, "learning_rate": 2.889451091097322e-06, "loss": 3.8982, "step": 501 }, { "epoch": 0.3968379446640316, "grad_norm": 2.3981363773345947, "learning_rate": 2.888978005530014e-06, "loss": 4.1533, "step": 502 }, { "epoch": 0.3976284584980237, "grad_norm": 2.278127670288086, "learning_rate": 2.8885039487537046e-06, "loss": 4.1428, "step": 503 }, { "epoch": 0.3984189723320158, "grad_norm": 2.4175944328308105, "learning_rate": 2.888028921099867e-06, "loss": 4.0581, "step": 504 }, { "epoch": 0.39920948616600793, "grad_norm": 2.0424671173095703, "learning_rate": 2.887552922900653e-06, "loss": 4.1329, "step": 505 }, { "epoch": 0.4, "grad_norm": 2.365112066268921, "learning_rate": 2.8870759544888923e-06, "loss": 4.1664, "step": 506 }, { "epoch": 0.4007905138339921, "grad_norm": 2.49619460105896, "learning_rate": 2.886598016198093e-06, "loss": 3.8395, "step": 507 }, { "epoch": 0.4015810276679842, "grad_norm": 2.11897873878479, "learning_rate": 2.8861191083624425e-06, "loss": 4.0448, "step": 508 }, { "epoch": 0.4023715415019763, "grad_norm": 2.3039608001708984, "learning_rate": 2.885639231316805e-06, "loss": 3.8567, "step": 509 }, { "epoch": 0.4031620553359684, "grad_norm": 2.291313886642456, "learning_rate": 2.885158385396723e-06, "loss": 4.1056, "step": 510 }, { "epoch": 0.4039525691699605, "grad_norm": 2.6800289154052734, "learning_rate": 2.884676570938417e-06, "loss": 4.0601, "step": 511 }, { "epoch": 0.40474308300395256, "grad_norm": 2.1854002475738525, "learning_rate": 2.884193788278783e-06, "loss": 4.1887, "step": 512 }, { "epoch": 0.40553359683794465, "grad_norm": 2.411978244781494, "learning_rate": 2.883710037755395e-06, "loss": 3.989, "step": 513 }, { "epoch": 0.40632411067193674, "grad_norm": 2.1253721714019775, "learning_rate": 2.8832253197065036e-06, "loss": 3.875, "step": 514 }, { "epoch": 0.40711462450592883, "grad_norm": 2.2550952434539795, "learning_rate": 2.8827396344710375e-06, "loss": 3.8109, "step": 515 }, { "epoch": 0.4079051383399209, "grad_norm": 2.129207134246826, "learning_rate": 2.8822529823885986e-06, "loss": 4.1962, "step": 516 }, { "epoch": 0.40869565217391307, "grad_norm": 2.4510416984558105, "learning_rate": 2.8817653637994677e-06, "loss": 3.9707, "step": 517 }, { "epoch": 0.40948616600790516, "grad_norm": 2.5612502098083496, "learning_rate": 2.8812767790445992e-06, "loss": 4.0335, "step": 518 }, { "epoch": 0.41027667984189725, "grad_norm": 2.147244453430176, "learning_rate": 2.8807872284656253e-06, "loss": 4.1445, "step": 519 }, { "epoch": 0.41106719367588934, "grad_norm": 2.5615427494049072, "learning_rate": 2.880296712404851e-06, "loss": 3.9153, "step": 520 }, { "epoch": 0.41185770750988143, "grad_norm": 2.027977228164673, "learning_rate": 2.8798052312052585e-06, "loss": 4.1996, "step": 521 }, { "epoch": 0.4126482213438735, "grad_norm": 2.177217960357666, "learning_rate": 2.8793127852105036e-06, "loss": 4.1113, "step": 522 }, { "epoch": 0.4134387351778656, "grad_norm": 2.2618627548217773, "learning_rate": 2.878819374764918e-06, "loss": 4.0501, "step": 523 }, { "epoch": 0.4142292490118577, "grad_norm": 2.374138832092285, "learning_rate": 2.8783250002135053e-06, "loss": 4.2767, "step": 524 }, { "epoch": 0.4150197628458498, "grad_norm": 2.475831985473633, "learning_rate": 2.8778296619019466e-06, "loss": 3.9448, "step": 525 }, { "epoch": 0.4158102766798419, "grad_norm": 2.3944785594940186, "learning_rate": 2.8773333601765944e-06, "loss": 4.3203, "step": 526 }, { "epoch": 0.416600790513834, "grad_norm": 2.3196866512298584, "learning_rate": 2.8768360953844753e-06, "loss": 4.0916, "step": 527 }, { "epoch": 0.41739130434782606, "grad_norm": 2.2608859539031982, "learning_rate": 2.8763378678732903e-06, "loss": 3.8965, "step": 528 }, { "epoch": 0.41818181818181815, "grad_norm": 2.0901129245758057, "learning_rate": 2.8758386779914115e-06, "loss": 4.1454, "step": 529 }, { "epoch": 0.4189723320158103, "grad_norm": 2.7267236709594727, "learning_rate": 2.875338526087887e-06, "loss": 3.9135, "step": 530 }, { "epoch": 0.4197628458498024, "grad_norm": 2.39241623878479, "learning_rate": 2.8748374125124348e-06, "loss": 3.8369, "step": 531 }, { "epoch": 0.4205533596837945, "grad_norm": 2.253249168395996, "learning_rate": 2.8743353376154456e-06, "loss": 4.0883, "step": 532 }, { "epoch": 0.42134387351778657, "grad_norm": 2.3943662643432617, "learning_rate": 2.8738323017479852e-06, "loss": 3.4587, "step": 533 }, { "epoch": 0.42213438735177866, "grad_norm": 2.360029458999634, "learning_rate": 2.873328305261787e-06, "loss": 4.029, "step": 534 }, { "epoch": 0.42292490118577075, "grad_norm": 2.21586275100708, "learning_rate": 2.8728233485092593e-06, "loss": 3.7316, "step": 535 }, { "epoch": 0.42371541501976284, "grad_norm": 2.085630178451538, "learning_rate": 2.8723174318434816e-06, "loss": 4.1587, "step": 536 }, { "epoch": 0.42450592885375493, "grad_norm": 2.1470534801483154, "learning_rate": 2.8718105556182015e-06, "loss": 4.1076, "step": 537 }, { "epoch": 0.425296442687747, "grad_norm": 1.9789143800735474, "learning_rate": 2.871302720187842e-06, "loss": 4.0875, "step": 538 }, { "epoch": 0.4260869565217391, "grad_norm": 2.4314351081848145, "learning_rate": 2.8707939259074937e-06, "loss": 4.0251, "step": 539 }, { "epoch": 0.4268774703557312, "grad_norm": 1.9909188747406006, "learning_rate": 2.870284173132919e-06, "loss": 4.296, "step": 540 }, { "epoch": 0.4276679841897233, "grad_norm": 2.089538335800171, "learning_rate": 2.8697734622205493e-06, "loss": 4.0827, "step": 541 }, { "epoch": 0.42845849802371544, "grad_norm": 2.420201539993286, "learning_rate": 2.8692617935274872e-06, "loss": 4.2443, "step": 542 }, { "epoch": 0.42924901185770753, "grad_norm": 2.378854751586914, "learning_rate": 2.868749167411505e-06, "loss": 3.6061, "step": 543 }, { "epoch": 0.4300395256916996, "grad_norm": 2.487191677093506, "learning_rate": 2.8682355842310437e-06, "loss": 3.8189, "step": 544 }, { "epoch": 0.4308300395256917, "grad_norm": 2.042949676513672, "learning_rate": 2.8677210443452136e-06, "loss": 4.25, "step": 545 }, { "epoch": 0.4316205533596838, "grad_norm": 1.9910869598388672, "learning_rate": 2.8672055481137943e-06, "loss": 4.4313, "step": 546 }, { "epoch": 0.4324110671936759, "grad_norm": 2.315190315246582, "learning_rate": 2.8666890958972336e-06, "loss": 3.8977, "step": 547 }, { "epoch": 0.433201581027668, "grad_norm": 2.2538933753967285, "learning_rate": 2.866171688056649e-06, "loss": 4.0491, "step": 548 }, { "epoch": 0.43399209486166007, "grad_norm": 2.428513765335083, "learning_rate": 2.8656533249538248e-06, "loss": 3.8857, "step": 549 }, { "epoch": 0.43478260869565216, "grad_norm": 2.205817937850952, "learning_rate": 2.8651340069512136e-06, "loss": 3.9595, "step": 550 }, { "epoch": 0.43557312252964425, "grad_norm": 3.7407803535461426, "learning_rate": 2.864613734411936e-06, "loss": 3.9792, "step": 551 }, { "epoch": 0.43636363636363634, "grad_norm": 2.0253074169158936, "learning_rate": 2.8640925076997804e-06, "loss": 4.3262, "step": 552 }, { "epoch": 0.43715415019762843, "grad_norm": 2.5751585960388184, "learning_rate": 2.863570327179201e-06, "loss": 3.555, "step": 553 }, { "epoch": 0.4379446640316206, "grad_norm": 2.182938814163208, "learning_rate": 2.8630471932153204e-06, "loss": 3.8849, "step": 554 }, { "epoch": 0.43873517786561267, "grad_norm": 3.5766892433166504, "learning_rate": 2.8625231061739267e-06, "loss": 4.046, "step": 555 }, { "epoch": 0.43952569169960476, "grad_norm": 2.0928170680999756, "learning_rate": 2.8619980664214754e-06, "loss": 4.2062, "step": 556 }, { "epoch": 0.44031620553359685, "grad_norm": 1.9364449977874756, "learning_rate": 2.861472074325088e-06, "loss": 4.5029, "step": 557 }, { "epoch": 0.44110671936758894, "grad_norm": 2.2682929039001465, "learning_rate": 2.860945130252551e-06, "loss": 4.1043, "step": 558 }, { "epoch": 0.44189723320158103, "grad_norm": 2.3232483863830566, "learning_rate": 2.860417234572318e-06, "loss": 4.0722, "step": 559 }, { "epoch": 0.4426877470355731, "grad_norm": 2.5432333946228027, "learning_rate": 2.859888387653506e-06, "loss": 3.6222, "step": 560 }, { "epoch": 0.4434782608695652, "grad_norm": 2.123521089553833, "learning_rate": 2.8593585898659e-06, "loss": 4.0861, "step": 561 }, { "epoch": 0.4442687747035573, "grad_norm": 2.466269016265869, "learning_rate": 2.8588278415799467e-06, "loss": 3.9939, "step": 562 }, { "epoch": 0.4450592885375494, "grad_norm": 2.5184695720672607, "learning_rate": 2.858296143166759e-06, "loss": 4.0648, "step": 563 }, { "epoch": 0.4458498023715415, "grad_norm": 2.224595785140991, "learning_rate": 2.8577634949981143e-06, "loss": 4.1662, "step": 564 }, { "epoch": 0.44664031620553357, "grad_norm": 2.040050983428955, "learning_rate": 2.857229897446455e-06, "loss": 4.1292, "step": 565 }, { "epoch": 0.4474308300395257, "grad_norm": 2.1761343479156494, "learning_rate": 2.8566953508848843e-06, "loss": 4.1809, "step": 566 }, { "epoch": 0.4482213438735178, "grad_norm": 2.1411030292510986, "learning_rate": 2.856159855687172e-06, "loss": 4.0548, "step": 567 }, { "epoch": 0.4490118577075099, "grad_norm": 2.3919944763183594, "learning_rate": 2.85562341222775e-06, "loss": 3.844, "step": 568 }, { "epoch": 0.449802371541502, "grad_norm": 3.1844711303710938, "learning_rate": 2.855086020881713e-06, "loss": 3.7819, "step": 569 }, { "epoch": 0.4505928853754941, "grad_norm": 2.054130792617798, "learning_rate": 2.854547682024819e-06, "loss": 4.2051, "step": 570 }, { "epoch": 0.45138339920948617, "grad_norm": 2.568441152572632, "learning_rate": 2.854008396033488e-06, "loss": 3.8052, "step": 571 }, { "epoch": 0.45217391304347826, "grad_norm": 2.2760677337646484, "learning_rate": 2.8534681632848027e-06, "loss": 4.1044, "step": 572 }, { "epoch": 0.45296442687747035, "grad_norm": 2.3700313568115234, "learning_rate": 2.8529269841565074e-06, "loss": 4.1633, "step": 573 }, { "epoch": 0.45375494071146244, "grad_norm": 2.96272349357605, "learning_rate": 2.8523848590270096e-06, "loss": 4.035, "step": 574 }, { "epoch": 0.45454545454545453, "grad_norm": 2.7959275245666504, "learning_rate": 2.851841788275376e-06, "loss": 4.3472, "step": 575 }, { "epoch": 0.4553359683794466, "grad_norm": 2.1788551807403564, "learning_rate": 2.8512977722813357e-06, "loss": 3.9651, "step": 576 }, { "epoch": 0.4561264822134387, "grad_norm": 2.113039970397949, "learning_rate": 2.8507528114252786e-06, "loss": 3.7786, "step": 577 }, { "epoch": 0.45691699604743086, "grad_norm": 2.322596549987793, "learning_rate": 2.850206906088255e-06, "loss": 4.4326, "step": 578 }, { "epoch": 0.45770750988142295, "grad_norm": 2.1326956748962402, "learning_rate": 2.8496600566519773e-06, "loss": 4.1485, "step": 579 }, { "epoch": 0.45849802371541504, "grad_norm": 2.0709495544433594, "learning_rate": 2.8491122634988152e-06, "loss": 4.1606, "step": 580 }, { "epoch": 0.4592885375494071, "grad_norm": 2.798429250717163, "learning_rate": 2.8485635270118e-06, "loss": 4.1043, "step": 581 }, { "epoch": 0.4600790513833992, "grad_norm": 2.8179612159729004, "learning_rate": 2.848013847574623e-06, "loss": 3.8873, "step": 582 }, { "epoch": 0.4608695652173913, "grad_norm": 2.6646182537078857, "learning_rate": 2.847463225571633e-06, "loss": 4.0087, "step": 583 }, { "epoch": 0.4616600790513834, "grad_norm": 2.2145209312438965, "learning_rate": 2.84691166138784e-06, "loss": 3.8746, "step": 584 }, { "epoch": 0.4624505928853755, "grad_norm": 2.0563061237335205, "learning_rate": 2.8463591554089112e-06, "loss": 4.1634, "step": 585 }, { "epoch": 0.4632411067193676, "grad_norm": 2.3554129600524902, "learning_rate": 2.8458057080211736e-06, "loss": 3.9963, "step": 586 }, { "epoch": 0.46403162055335967, "grad_norm": 2.420006036758423, "learning_rate": 2.845251319611611e-06, "loss": 3.7184, "step": 587 }, { "epoch": 0.46482213438735176, "grad_norm": 2.145489454269409, "learning_rate": 2.8446959905678663e-06, "loss": 3.9338, "step": 588 }, { "epoch": 0.46561264822134385, "grad_norm": 2.283564567565918, "learning_rate": 2.8441397212782397e-06, "loss": 4.0669, "step": 589 }, { "epoch": 0.466403162055336, "grad_norm": 2.5423386096954346, "learning_rate": 2.8435825121316893e-06, "loss": 4.1297, "step": 590 }, { "epoch": 0.4671936758893281, "grad_norm": 2.0828325748443604, "learning_rate": 2.84302436351783e-06, "loss": 4.1111, "step": 591 }, { "epoch": 0.4679841897233202, "grad_norm": 2.157034397125244, "learning_rate": 2.842465275826933e-06, "loss": 3.9483, "step": 592 }, { "epoch": 0.46877470355731227, "grad_norm": 2.4070687294006348, "learning_rate": 2.841905249449927e-06, "loss": 3.8037, "step": 593 }, { "epoch": 0.46956521739130436, "grad_norm": 2.232006549835205, "learning_rate": 2.841344284778398e-06, "loss": 4.0877, "step": 594 }, { "epoch": 0.47035573122529645, "grad_norm": 2.2635626792907715, "learning_rate": 2.8407823822045845e-06, "loss": 3.9981, "step": 595 }, { "epoch": 0.47114624505928854, "grad_norm": 2.3407692909240723, "learning_rate": 2.8402195421213848e-06, "loss": 4.1059, "step": 596 }, { "epoch": 0.47193675889328063, "grad_norm": 2.365541934967041, "learning_rate": 2.839655764922351e-06, "loss": 4.1844, "step": 597 }, { "epoch": 0.4727272727272727, "grad_norm": 2.39309024810791, "learning_rate": 2.8390910510016897e-06, "loss": 4.2956, "step": 598 }, { "epoch": 0.4735177865612648, "grad_norm": 2.165889263153076, "learning_rate": 2.8385254007542645e-06, "loss": 4.2104, "step": 599 }, { "epoch": 0.4743083003952569, "grad_norm": 2.9026732444763184, "learning_rate": 2.8379588145755916e-06, "loss": 3.7711, "step": 600 }, { "epoch": 0.475098814229249, "grad_norm": 2.0680224895477295, "learning_rate": 2.8373912928618436e-06, "loss": 4.1002, "step": 601 }, { "epoch": 0.4758893280632411, "grad_norm": 3.1935153007507324, "learning_rate": 2.836822836009845e-06, "loss": 4.0975, "step": 602 }, { "epoch": 0.4766798418972332, "grad_norm": 2.2129180431365967, "learning_rate": 2.836253444417076e-06, "loss": 4.0719, "step": 603 }, { "epoch": 0.4774703557312253, "grad_norm": 2.199618101119995, "learning_rate": 2.83568311848167e-06, "loss": 4.2078, "step": 604 }, { "epoch": 0.4782608695652174, "grad_norm": 2.368140935897827, "learning_rate": 2.835111858602413e-06, "loss": 4.0887, "step": 605 }, { "epoch": 0.4790513833992095, "grad_norm": 2.209469795227051, "learning_rate": 2.834539665178745e-06, "loss": 3.9937, "step": 606 }, { "epoch": 0.4798418972332016, "grad_norm": 2.1404333114624023, "learning_rate": 2.833966538610758e-06, "loss": 3.9123, "step": 607 }, { "epoch": 0.4806324110671937, "grad_norm": 2.304215431213379, "learning_rate": 2.833392479299197e-06, "loss": 3.8439, "step": 608 }, { "epoch": 0.48142292490118577, "grad_norm": 2.2381908893585205, "learning_rate": 2.8328174876454584e-06, "loss": 4.0649, "step": 609 }, { "epoch": 0.48221343873517786, "grad_norm": 2.2449824810028076, "learning_rate": 2.8322415640515925e-06, "loss": 3.799, "step": 610 }, { "epoch": 0.48300395256916995, "grad_norm": 2.1535253524780273, "learning_rate": 2.831664708920298e-06, "loss": 4.2312, "step": 611 }, { "epoch": 0.48379446640316204, "grad_norm": 2.140223264694214, "learning_rate": 2.8310869226549277e-06, "loss": 4.2799, "step": 612 }, { "epoch": 0.48458498023715413, "grad_norm": 2.6043553352355957, "learning_rate": 2.8305082056594845e-06, "loss": 3.8953, "step": 613 }, { "epoch": 0.4853754940711462, "grad_norm": 2.254485845565796, "learning_rate": 2.829928558338623e-06, "loss": 3.8844, "step": 614 }, { "epoch": 0.48616600790513836, "grad_norm": 2.373791217803955, "learning_rate": 2.829347981097646e-06, "loss": 4.3709, "step": 615 }, { "epoch": 0.48695652173913045, "grad_norm": 2.0198941230773926, "learning_rate": 2.828766474342508e-06, "loss": 4.2513, "step": 616 }, { "epoch": 0.48774703557312254, "grad_norm": 2.3215599060058594, "learning_rate": 2.8281840384798147e-06, "loss": 4.1823, "step": 617 }, { "epoch": 0.48853754940711464, "grad_norm": 2.333178758621216, "learning_rate": 2.8276006739168196e-06, "loss": 3.8074, "step": 618 }, { "epoch": 0.4893280632411067, "grad_norm": 2.0495738983154297, "learning_rate": 2.827016381061426e-06, "loss": 3.9408, "step": 619 }, { "epoch": 0.4901185770750988, "grad_norm": 2.4462196826934814, "learning_rate": 2.826431160322186e-06, "loss": 4.0675, "step": 620 }, { "epoch": 0.4909090909090909, "grad_norm": 2.2717108726501465, "learning_rate": 2.825845012108302e-06, "loss": 3.7053, "step": 621 }, { "epoch": 0.491699604743083, "grad_norm": 2.2725746631622314, "learning_rate": 2.8252579368296224e-06, "loss": 3.7527, "step": 622 }, { "epoch": 0.4924901185770751, "grad_norm": 2.3421661853790283, "learning_rate": 2.8246699348966473e-06, "loss": 4.079, "step": 623 }, { "epoch": 0.4932806324110672, "grad_norm": 2.382239580154419, "learning_rate": 2.8240810067205207e-06, "loss": 3.8848, "step": 624 }, { "epoch": 0.49407114624505927, "grad_norm": 2.0931429862976074, "learning_rate": 2.8234911527130372e-06, "loss": 4.0422, "step": 625 }, { "epoch": 0.49486166007905136, "grad_norm": 2.5677289962768555, "learning_rate": 2.8229003732866377e-06, "loss": 3.9248, "step": 626 }, { "epoch": 0.4956521739130435, "grad_norm": 2.224529504776001, "learning_rate": 2.8223086688544108e-06, "loss": 4.1603, "step": 627 }, { "epoch": 0.4964426877470356, "grad_norm": 2.0841875076293945, "learning_rate": 2.82171603983009e-06, "loss": 4.212, "step": 628 }, { "epoch": 0.4972332015810277, "grad_norm": 2.05281662940979, "learning_rate": 2.8211224866280584e-06, "loss": 4.1907, "step": 629 }, { "epoch": 0.4980237154150198, "grad_norm": 2.1697418689727783, "learning_rate": 2.8205280096633426e-06, "loss": 4.2186, "step": 630 }, { "epoch": 0.49881422924901186, "grad_norm": 2.277703046798706, "learning_rate": 2.8199326093516164e-06, "loss": 3.7951, "step": 631 }, { "epoch": 0.49960474308300395, "grad_norm": 2.4729220867156982, "learning_rate": 2.8193362861091986e-06, "loss": 3.6974, "step": 632 }, { "epoch": 0.500395256916996, "grad_norm": 2.2112340927124023, "learning_rate": 2.8187390403530543e-06, "loss": 4.1716, "step": 633 }, { "epoch": 0.5011857707509881, "grad_norm": 2.618025302886963, "learning_rate": 2.818140872500793e-06, "loss": 4.0599, "step": 634 }, { "epoch": 0.5019762845849802, "grad_norm": 2.12532377243042, "learning_rate": 2.8175417829706683e-06, "loss": 4.1384, "step": 635 }, { "epoch": 0.5027667984189723, "grad_norm": 2.549034357070923, "learning_rate": 2.8169417721815793e-06, "loss": 4.0674, "step": 636 }, { "epoch": 0.5035573122529644, "grad_norm": 2.8372080326080322, "learning_rate": 2.8163408405530693e-06, "loss": 3.6026, "step": 637 }, { "epoch": 0.5043478260869565, "grad_norm": 2.359428644180298, "learning_rate": 2.815738988505324e-06, "loss": 3.8664, "step": 638 }, { "epoch": 0.5051383399209486, "grad_norm": 2.1748971939086914, "learning_rate": 2.8151362164591753e-06, "loss": 3.8317, "step": 639 }, { "epoch": 0.5059288537549407, "grad_norm": 2.1719791889190674, "learning_rate": 2.8145325248360958e-06, "loss": 4.1298, "step": 640 }, { "epoch": 0.5067193675889328, "grad_norm": 2.909386157989502, "learning_rate": 2.8139279140582023e-06, "loss": 3.8789, "step": 641 }, { "epoch": 0.5075098814229249, "grad_norm": 2.0672121047973633, "learning_rate": 2.8133223845482545e-06, "loss": 4.2678, "step": 642 }, { "epoch": 0.508300395256917, "grad_norm": 4.978974342346191, "learning_rate": 2.812715936729654e-06, "loss": 3.4823, "step": 643 }, { "epoch": 0.509090909090909, "grad_norm": 2.3040974140167236, "learning_rate": 2.8121085710264444e-06, "loss": 3.7972, "step": 644 }, { "epoch": 0.5098814229249012, "grad_norm": 2.371504306793213, "learning_rate": 2.811500287863311e-06, "loss": 4.0063, "step": 645 }, { "epoch": 0.5106719367588933, "grad_norm": 2.901454448699951, "learning_rate": 2.8108910876655824e-06, "loss": 3.7868, "step": 646 }, { "epoch": 0.5114624505928854, "grad_norm": 2.176074266433716, "learning_rate": 2.8102809708592254e-06, "loss": 3.9326, "step": 647 }, { "epoch": 0.5122529644268775, "grad_norm": 2.6125783920288086, "learning_rate": 2.8096699378708503e-06, "loss": 4.1597, "step": 648 }, { "epoch": 0.5130434782608696, "grad_norm": 2.9305193424224854, "learning_rate": 2.809057989127707e-06, "loss": 4.0594, "step": 649 }, { "epoch": 0.5138339920948617, "grad_norm": 2.2853384017944336, "learning_rate": 2.808445125057685e-06, "loss": 3.6683, "step": 650 }, { "epoch": 0.5146245059288538, "grad_norm": 2.2805874347686768, "learning_rate": 2.807831346089315e-06, "loss": 4.0592, "step": 651 }, { "epoch": 0.5154150197628459, "grad_norm": 2.424262762069702, "learning_rate": 2.8072166526517667e-06, "loss": 3.7423, "step": 652 }, { "epoch": 0.516205533596838, "grad_norm": 3.8474109172821045, "learning_rate": 2.80660104517485e-06, "loss": 3.7056, "step": 653 }, { "epoch": 0.51699604743083, "grad_norm": 2.7297778129577637, "learning_rate": 2.8059845240890127e-06, "loss": 3.9871, "step": 654 }, { "epoch": 0.5177865612648221, "grad_norm": 2.037923812866211, "learning_rate": 2.805367089825343e-06, "loss": 4.2278, "step": 655 }, { "epoch": 0.5185770750988142, "grad_norm": 1.9329725503921509, "learning_rate": 2.8047487428155665e-06, "loss": 4.3012, "step": 656 }, { "epoch": 0.5193675889328063, "grad_norm": 2.0355772972106934, "learning_rate": 2.8041294834920472e-06, "loss": 4.268, "step": 657 }, { "epoch": 0.5201581027667984, "grad_norm": 2.3696775436401367, "learning_rate": 2.8035093122877865e-06, "loss": 3.9854, "step": 658 }, { "epoch": 0.5209486166007905, "grad_norm": 3.532470703125, "learning_rate": 2.802888229636425e-06, "loss": 4.4068, "step": 659 }, { "epoch": 0.5217391304347826, "grad_norm": 2.3635644912719727, "learning_rate": 2.8022662359722387e-06, "loss": 4.0598, "step": 660 }, { "epoch": 0.5225296442687747, "grad_norm": 2.681267023086548, "learning_rate": 2.8016433317301426e-06, "loss": 4.2382, "step": 661 }, { "epoch": 0.5233201581027668, "grad_norm": 2.2814853191375732, "learning_rate": 2.801019517345686e-06, "loss": 3.6857, "step": 662 }, { "epoch": 0.5241106719367589, "grad_norm": 2.8479981422424316, "learning_rate": 2.8003947932550564e-06, "loss": 4.0725, "step": 663 }, { "epoch": 0.524901185770751, "grad_norm": 2.2539784908294678, "learning_rate": 2.7997691598950774e-06, "loss": 4.2103, "step": 664 }, { "epoch": 0.525691699604743, "grad_norm": 2.925358295440674, "learning_rate": 2.7991426177032075e-06, "loss": 3.981, "step": 665 }, { "epoch": 0.5264822134387351, "grad_norm": 2.615386724472046, "learning_rate": 2.7985151671175407e-06, "loss": 3.902, "step": 666 }, { "epoch": 0.5272727272727272, "grad_norm": 2.2849299907684326, "learning_rate": 2.7978868085768065e-06, "loss": 4.1873, "step": 667 }, { "epoch": 0.5280632411067193, "grad_norm": 2.2208516597747803, "learning_rate": 2.7972575425203703e-06, "loss": 3.9633, "step": 668 }, { "epoch": 0.5288537549407115, "grad_norm": 2.112748861312866, "learning_rate": 2.79662736938823e-06, "loss": 4.0706, "step": 669 }, { "epoch": 0.5296442687747036, "grad_norm": 2.9148709774017334, "learning_rate": 2.795996289621019e-06, "loss": 3.8787, "step": 670 }, { "epoch": 0.5304347826086957, "grad_norm": 2.6918740272521973, "learning_rate": 2.7953643036600053e-06, "loss": 3.6628, "step": 671 }, { "epoch": 0.5312252964426878, "grad_norm": 2.289088726043701, "learning_rate": 2.794731411947088e-06, "loss": 3.6531, "step": 672 }, { "epoch": 0.5320158102766799, "grad_norm": 2.0199086666107178, "learning_rate": 2.794097614924803e-06, "loss": 3.987, "step": 673 }, { "epoch": 0.532806324110672, "grad_norm": 2.4424777030944824, "learning_rate": 2.793462913036316e-06, "loss": 4.0931, "step": 674 }, { "epoch": 0.5335968379446641, "grad_norm": 3.127809524536133, "learning_rate": 2.7928273067254277e-06, "loss": 3.9234, "step": 675 }, { "epoch": 0.5343873517786562, "grad_norm": 2.2304930686950684, "learning_rate": 2.7921907964365703e-06, "loss": 3.6626, "step": 676 }, { "epoch": 0.5351778656126482, "grad_norm": 2.277078866958618, "learning_rate": 2.7915533826148068e-06, "loss": 4.1125, "step": 677 }, { "epoch": 0.5359683794466403, "grad_norm": 1.95723295211792, "learning_rate": 2.7909150657058353e-06, "loss": 4.3688, "step": 678 }, { "epoch": 0.5367588932806324, "grad_norm": 2.23193359375, "learning_rate": 2.7902758461559824e-06, "loss": 3.9887, "step": 679 }, { "epoch": 0.5375494071146245, "grad_norm": 2.244748592376709, "learning_rate": 2.7896357244122056e-06, "loss": 4.0277, "step": 680 }, { "epoch": 0.5383399209486166, "grad_norm": 2.42121958732605, "learning_rate": 2.7889947009220963e-06, "loss": 3.9704, "step": 681 }, { "epoch": 0.5391304347826087, "grad_norm": 2.3564934730529785, "learning_rate": 2.788352776133874e-06, "loss": 4.0001, "step": 682 }, { "epoch": 0.5399209486166008, "grad_norm": 2.0918052196502686, "learning_rate": 2.7877099504963883e-06, "loss": 4.2014, "step": 683 }, { "epoch": 0.5407114624505929, "grad_norm": 2.1483898162841797, "learning_rate": 2.78706622445912e-06, "loss": 4.2543, "step": 684 }, { "epoch": 0.541501976284585, "grad_norm": 2.398033618927002, "learning_rate": 2.7864215984721784e-06, "loss": 4.0991, "step": 685 }, { "epoch": 0.542292490118577, "grad_norm": 2.1454689502716064, "learning_rate": 2.7857760729863026e-06, "loss": 4.0833, "step": 686 }, { "epoch": 0.5430830039525691, "grad_norm": 2.5210142135620117, "learning_rate": 2.785129648452861e-06, "loss": 4.0775, "step": 687 }, { "epoch": 0.5438735177865612, "grad_norm": 2.152167558670044, "learning_rate": 2.7844823253238494e-06, "loss": 3.6543, "step": 688 }, { "epoch": 0.5446640316205533, "grad_norm": 2.021414041519165, "learning_rate": 2.783834104051894e-06, "loss": 4.3388, "step": 689 }, { "epoch": 0.5454545454545454, "grad_norm": 2.2559609413146973, "learning_rate": 2.783184985090246e-06, "loss": 4.553, "step": 690 }, { "epoch": 0.5462450592885375, "grad_norm": 3.4043941497802734, "learning_rate": 2.7825349688927874e-06, "loss": 3.7887, "step": 691 }, { "epoch": 0.5470355731225296, "grad_norm": 2.1067426204681396, "learning_rate": 2.7818840559140255e-06, "loss": 4.2243, "step": 692 }, { "epoch": 0.5478260869565217, "grad_norm": 2.0501089096069336, "learning_rate": 2.781232246609095e-06, "loss": 4.2791, "step": 693 }, { "epoch": 0.5486166007905139, "grad_norm": 2.3356237411499023, "learning_rate": 2.780579541433759e-06, "loss": 3.9583, "step": 694 }, { "epoch": 0.549407114624506, "grad_norm": 2.3383827209472656, "learning_rate": 2.7799259408444045e-06, "loss": 3.8229, "step": 695 }, { "epoch": 0.5501976284584981, "grad_norm": 2.993069648742676, "learning_rate": 2.779271445298046e-06, "loss": 3.589, "step": 696 }, { "epoch": 0.5509881422924902, "grad_norm": 2.245392322540283, "learning_rate": 2.778616055252324e-06, "loss": 3.9643, "step": 697 }, { "epoch": 0.5517786561264822, "grad_norm": 4.990372180938721, "learning_rate": 2.7779597711655037e-06, "loss": 3.7945, "step": 698 }, { "epoch": 0.5525691699604743, "grad_norm": 2.2842657566070557, "learning_rate": 2.7773025934964753e-06, "loss": 4.4629, "step": 699 }, { "epoch": 0.5533596837944664, "grad_norm": 2.0105669498443604, "learning_rate": 2.776644522704755e-06, "loss": 4.3025, "step": 700 }, { "epoch": 0.5541501976284585, "grad_norm": 2.468008041381836, "learning_rate": 2.775985559250482e-06, "loss": 3.8597, "step": 701 }, { "epoch": 0.5549407114624506, "grad_norm": 2.1786882877349854, "learning_rate": 2.7753257035944216e-06, "loss": 4.1261, "step": 702 }, { "epoch": 0.5557312252964427, "grad_norm": 2.292381763458252, "learning_rate": 2.7746649561979603e-06, "loss": 4.0201, "step": 703 }, { "epoch": 0.5565217391304348, "grad_norm": 2.408872365951538, "learning_rate": 2.7740033175231107e-06, "loss": 4.2469, "step": 704 }, { "epoch": 0.5573122529644269, "grad_norm": 2.1927480697631836, "learning_rate": 2.7733407880325074e-06, "loss": 3.9677, "step": 705 }, { "epoch": 0.558102766798419, "grad_norm": 2.017225503921509, "learning_rate": 2.772677368189407e-06, "loss": 4.3018, "step": 706 }, { "epoch": 0.5588932806324111, "grad_norm": 2.3277032375335693, "learning_rate": 2.772013058457691e-06, "loss": 4.0788, "step": 707 }, { "epoch": 0.5596837944664032, "grad_norm": 2.147150754928589, "learning_rate": 2.771347859301861e-06, "loss": 4.1446, "step": 708 }, { "epoch": 0.5604743083003952, "grad_norm": 2.637802839279175, "learning_rate": 2.7706817711870414e-06, "loss": 4.1923, "step": 709 }, { "epoch": 0.5612648221343873, "grad_norm": 2.3971617221832275, "learning_rate": 2.770014794578978e-06, "loss": 3.6373, "step": 710 }, { "epoch": 0.5620553359683794, "grad_norm": 2.2539501190185547, "learning_rate": 2.7693469299440383e-06, "loss": 3.7885, "step": 711 }, { "epoch": 0.5628458498023715, "grad_norm": 2.2222349643707275, "learning_rate": 2.76867817774921e-06, "loss": 3.8375, "step": 712 }, { "epoch": 0.5636363636363636, "grad_norm": 2.305565595626831, "learning_rate": 2.7680085384621025e-06, "loss": 3.8982, "step": 713 }, { "epoch": 0.5644268774703557, "grad_norm": 2.3534438610076904, "learning_rate": 2.7673380125509433e-06, "loss": 4.0982, "step": 714 }, { "epoch": 0.5652173913043478, "grad_norm": 2.238638401031494, "learning_rate": 2.7666666004845826e-06, "loss": 4.3363, "step": 715 }, { "epoch": 0.5660079051383399, "grad_norm": 2.405538558959961, "learning_rate": 2.765994302732488e-06, "loss": 3.9617, "step": 716 }, { "epoch": 0.566798418972332, "grad_norm": 2.07778263092041, "learning_rate": 2.7653211197647476e-06, "loss": 4.1974, "step": 717 }, { "epoch": 0.5675889328063242, "grad_norm": 2.6020214557647705, "learning_rate": 2.764647052052069e-06, "loss": 3.8138, "step": 718 }, { "epoch": 0.5683794466403163, "grad_norm": 2.932326078414917, "learning_rate": 2.7639721000657765e-06, "loss": 3.7021, "step": 719 }, { "epoch": 0.5691699604743083, "grad_norm": 2.331296443939209, "learning_rate": 2.7632962642778144e-06, "loss": 4.1306, "step": 720 }, { "epoch": 0.5699604743083004, "grad_norm": 2.068192720413208, "learning_rate": 2.7626195451607445e-06, "loss": 4.4361, "step": 721 }, { "epoch": 0.5707509881422925, "grad_norm": 2.4391589164733887, "learning_rate": 2.7619419431877453e-06, "loss": 4.061, "step": 722 }, { "epoch": 0.5715415019762846, "grad_norm": 2.043933391571045, "learning_rate": 2.7612634588326143e-06, "loss": 4.5997, "step": 723 }, { "epoch": 0.5723320158102767, "grad_norm": 2.186466693878174, "learning_rate": 2.7605840925697654e-06, "loss": 4.457, "step": 724 }, { "epoch": 0.5731225296442688, "grad_norm": 2.1467323303222656, "learning_rate": 2.759903844874228e-06, "loss": 3.9649, "step": 725 }, { "epoch": 0.5739130434782609, "grad_norm": 2.0007355213165283, "learning_rate": 2.759222716221649e-06, "loss": 4.1707, "step": 726 }, { "epoch": 0.574703557312253, "grad_norm": 2.461233377456665, "learning_rate": 2.758540707088291e-06, "loss": 3.859, "step": 727 }, { "epoch": 0.5754940711462451, "grad_norm": 2.7813708782196045, "learning_rate": 2.757857817951032e-06, "loss": 4.1143, "step": 728 }, { "epoch": 0.5762845849802372, "grad_norm": 2.369342803955078, "learning_rate": 2.7571740492873668e-06, "loss": 4.2502, "step": 729 }, { "epoch": 0.5770750988142292, "grad_norm": 2.2506895065307617, "learning_rate": 2.756489401575403e-06, "loss": 3.8049, "step": 730 }, { "epoch": 0.5778656126482213, "grad_norm": 2.371105194091797, "learning_rate": 2.7558038752938634e-06, "loss": 3.8163, "step": 731 }, { "epoch": 0.5786561264822134, "grad_norm": 2.0559427738189697, "learning_rate": 2.755117470922086e-06, "loss": 4.0139, "step": 732 }, { "epoch": 0.5794466403162055, "grad_norm": 5.048698425292969, "learning_rate": 2.754430188940023e-06, "loss": 4.2157, "step": 733 }, { "epoch": 0.5802371541501976, "grad_norm": 2.1967275142669678, "learning_rate": 2.7537420298282384e-06, "loss": 4.0414, "step": 734 }, { "epoch": 0.5810276679841897, "grad_norm": 2.1241018772125244, "learning_rate": 2.7530529940679116e-06, "loss": 4.1134, "step": 735 }, { "epoch": 0.5818181818181818, "grad_norm": 2.0556671619415283, "learning_rate": 2.7523630821408334e-06, "loss": 4.1303, "step": 736 }, { "epoch": 0.5826086956521739, "grad_norm": 2.2812421321868896, "learning_rate": 2.7516722945294082e-06, "loss": 3.8912, "step": 737 }, { "epoch": 0.583399209486166, "grad_norm": 2.0969622135162354, "learning_rate": 2.7509806317166523e-06, "loss": 4.1483, "step": 738 }, { "epoch": 0.5841897233201581, "grad_norm": 2.2219130992889404, "learning_rate": 2.750288094186194e-06, "loss": 4.3074, "step": 739 }, { "epoch": 0.5849802371541502, "grad_norm": 1.9734611511230469, "learning_rate": 2.7495946824222733e-06, "loss": 4.3058, "step": 740 }, { "epoch": 0.5857707509881422, "grad_norm": 2.2835097312927246, "learning_rate": 2.748900396909742e-06, "loss": 3.9823, "step": 741 }, { "epoch": 0.5865612648221344, "grad_norm": 2.217108964920044, "learning_rate": 2.7482052381340615e-06, "loss": 4.1581, "step": 742 }, { "epoch": 0.5873517786561265, "grad_norm": 2.1147851943969727, "learning_rate": 2.7475092065813057e-06, "loss": 4.4065, "step": 743 }, { "epoch": 0.5881422924901186, "grad_norm": 2.978458881378174, "learning_rate": 2.7468123027381562e-06, "loss": 3.9137, "step": 744 }, { "epoch": 0.5889328063241107, "grad_norm": 2.3670568466186523, "learning_rate": 2.746114527091907e-06, "loss": 3.8032, "step": 745 }, { "epoch": 0.5897233201581028, "grad_norm": 2.058431625366211, "learning_rate": 2.745415880130461e-06, "loss": 4.169, "step": 746 }, { "epoch": 0.5905138339920949, "grad_norm": 2.6442832946777344, "learning_rate": 2.7447163623423284e-06, "loss": 3.7209, "step": 747 }, { "epoch": 0.591304347826087, "grad_norm": 2.440147876739502, "learning_rate": 2.7440159742166325e-06, "loss": 3.8384, "step": 748 }, { "epoch": 0.5920948616600791, "grad_norm": 2.2360751628875732, "learning_rate": 2.7433147162431007e-06, "loss": 4.1513, "step": 749 }, { "epoch": 0.5928853754940712, "grad_norm": 2.5978939533233643, "learning_rate": 2.7426125889120704e-06, "loss": 3.9397, "step": 750 }, { "epoch": 0.5936758893280633, "grad_norm": 2.585742712020874, "learning_rate": 2.7419095927144884e-06, "loss": 3.6625, "step": 751 }, { "epoch": 0.5944664031620553, "grad_norm": 2.1294901371002197, "learning_rate": 2.7412057281419065e-06, "loss": 4.0389, "step": 752 }, { "epoch": 0.5952569169960474, "grad_norm": 2.14139986038208, "learning_rate": 2.740500995686486e-06, "loss": 4.0141, "step": 753 }, { "epoch": 0.5960474308300395, "grad_norm": 2.3223159313201904, "learning_rate": 2.7397953958409923e-06, "loss": 4.3141, "step": 754 }, { "epoch": 0.5968379446640316, "grad_norm": 2.1804556846618652, "learning_rate": 2.7390889290988e-06, "loss": 3.8075, "step": 755 }, { "epoch": 0.5976284584980237, "grad_norm": 2.5544402599334717, "learning_rate": 2.7383815959538893e-06, "loss": 3.8848, "step": 756 }, { "epoch": 0.5984189723320158, "grad_norm": 2.207568883895874, "learning_rate": 2.737673396900844e-06, "loss": 3.9454, "step": 757 }, { "epoch": 0.5992094861660079, "grad_norm": 2.358778953552246, "learning_rate": 2.7369643324348557e-06, "loss": 4.2534, "step": 758 }, { "epoch": 0.6, "grad_norm": 2.363548994064331, "learning_rate": 2.736254403051721e-06, "loss": 4.1079, "step": 759 }, { "epoch": 0.6007905138339921, "grad_norm": 2.524233818054199, "learning_rate": 2.7355436092478406e-06, "loss": 3.8594, "step": 760 }, { "epoch": 0.6015810276679842, "grad_norm": 2.1500375270843506, "learning_rate": 2.734831951520219e-06, "loss": 4.1897, "step": 761 }, { "epoch": 0.6023715415019762, "grad_norm": 2.136174201965332, "learning_rate": 2.734119430366466e-06, "loss": 4.2623, "step": 762 }, { "epoch": 0.6031620553359683, "grad_norm": 2.0907034873962402, "learning_rate": 2.7334060462847946e-06, "loss": 4.0476, "step": 763 }, { "epoch": 0.6039525691699604, "grad_norm": 2.60268235206604, "learning_rate": 2.7326917997740214e-06, "loss": 3.9609, "step": 764 }, { "epoch": 0.6047430830039525, "grad_norm": 2.1370322704315186, "learning_rate": 2.7319766913335655e-06, "loss": 4.1526, "step": 765 }, { "epoch": 0.6055335968379446, "grad_norm": 2.2533068656921387, "learning_rate": 2.731260721463449e-06, "loss": 4.1455, "step": 766 }, { "epoch": 0.6063241106719368, "grad_norm": 2.196826457977295, "learning_rate": 2.730543890664297e-06, "loss": 4.1561, "step": 767 }, { "epoch": 0.6071146245059289, "grad_norm": 2.3944432735443115, "learning_rate": 2.7298261994373353e-06, "loss": 4.2052, "step": 768 }, { "epoch": 0.607905138339921, "grad_norm": 2.3913824558258057, "learning_rate": 2.7291076482843922e-06, "loss": 4.0536, "step": 769 }, { "epoch": 0.6086956521739131, "grad_norm": 2.1915314197540283, "learning_rate": 2.7283882377078976e-06, "loss": 4.0304, "step": 770 }, { "epoch": 0.6094861660079052, "grad_norm": 2.4736485481262207, "learning_rate": 2.727667968210881e-06, "loss": 3.8405, "step": 771 }, { "epoch": 0.6102766798418973, "grad_norm": 3.0956010818481445, "learning_rate": 2.7269468402969737e-06, "loss": 3.9232, "step": 772 }, { "epoch": 0.6110671936758894, "grad_norm": 2.289487600326538, "learning_rate": 2.7262248544704065e-06, "loss": 4.3977, "step": 773 }, { "epoch": 0.6118577075098814, "grad_norm": 2.3759517669677734, "learning_rate": 2.7255020112360107e-06, "loss": 3.9031, "step": 774 }, { "epoch": 0.6126482213438735, "grad_norm": 2.6178770065307617, "learning_rate": 2.7247783110992167e-06, "loss": 4.126, "step": 775 }, { "epoch": 0.6134387351778656, "grad_norm": 2.34268856048584, "learning_rate": 2.7240537545660544e-06, "loss": 4.1544, "step": 776 }, { "epoch": 0.6142292490118577, "grad_norm": 2.388896942138672, "learning_rate": 2.7233283421431515e-06, "loss": 3.6655, "step": 777 }, { "epoch": 0.6150197628458498, "grad_norm": 2.1743743419647217, "learning_rate": 2.722602074337736e-06, "loss": 3.9286, "step": 778 }, { "epoch": 0.6158102766798419, "grad_norm": 2.1510653495788574, "learning_rate": 2.721874951657632e-06, "loss": 3.9295, "step": 779 }, { "epoch": 0.616600790513834, "grad_norm": 2.2961676120758057, "learning_rate": 2.721146974611263e-06, "loss": 3.6915, "step": 780 }, { "epoch": 0.6173913043478261, "grad_norm": 2.491870403289795, "learning_rate": 2.720418143707649e-06, "loss": 3.8641, "step": 781 }, { "epoch": 0.6181818181818182, "grad_norm": 2.2166571617126465, "learning_rate": 2.7196884594564073e-06, "loss": 4.1055, "step": 782 }, { "epoch": 0.6189723320158103, "grad_norm": 2.0465400218963623, "learning_rate": 2.7189579223677514e-06, "loss": 4.194, "step": 783 }, { "epoch": 0.6197628458498023, "grad_norm": 2.1891303062438965, "learning_rate": 2.718226532952492e-06, "loss": 4.2203, "step": 784 }, { "epoch": 0.6205533596837944, "grad_norm": 3.012010097503662, "learning_rate": 2.7174942917220354e-06, "loss": 3.6605, "step": 785 }, { "epoch": 0.6213438735177865, "grad_norm": 2.0964033603668213, "learning_rate": 2.716761199188383e-06, "loss": 4.052, "step": 786 }, { "epoch": 0.6221343873517786, "grad_norm": 2.1600875854492188, "learning_rate": 2.716027255864132e-06, "loss": 3.9919, "step": 787 }, { "epoch": 0.6229249011857707, "grad_norm": 2.192486524581909, "learning_rate": 2.715292462262475e-06, "loss": 4.0503, "step": 788 }, { "epoch": 0.6237154150197628, "grad_norm": 2.1649792194366455, "learning_rate": 2.7145568188971973e-06, "loss": 3.9761, "step": 789 }, { "epoch": 0.6245059288537549, "grad_norm": 3.2397007942199707, "learning_rate": 2.7138203262826807e-06, "loss": 3.6522, "step": 790 }, { "epoch": 0.6252964426877471, "grad_norm": 2.4287755489349365, "learning_rate": 2.713082984933899e-06, "loss": 4.3248, "step": 791 }, { "epoch": 0.6260869565217392, "grad_norm": 2.164283275604248, "learning_rate": 2.7123447953664203e-06, "loss": 3.9243, "step": 792 }, { "epoch": 0.6268774703557313, "grad_norm": 2.5148348808288574, "learning_rate": 2.7116057580964063e-06, "loss": 4.254, "step": 793 }, { "epoch": 0.6276679841897234, "grad_norm": 2.4681942462921143, "learning_rate": 2.7108658736406097e-06, "loss": 3.8851, "step": 794 }, { "epoch": 0.6284584980237155, "grad_norm": 2.078508138656616, "learning_rate": 2.7101251425163776e-06, "loss": 3.8823, "step": 795 }, { "epoch": 0.6292490118577075, "grad_norm": 2.1149725914001465, "learning_rate": 2.7093835652416473e-06, "loss": 4.106, "step": 796 }, { "epoch": 0.6300395256916996, "grad_norm": 2.52842116355896, "learning_rate": 2.708641142334949e-06, "loss": 3.6131, "step": 797 }, { "epoch": 0.6308300395256917, "grad_norm": 2.4161345958709717, "learning_rate": 2.7078978743154046e-06, "loss": 3.8973, "step": 798 }, { "epoch": 0.6316205533596838, "grad_norm": 2.250732421875, "learning_rate": 2.707153761702725e-06, "loss": 4.33, "step": 799 }, { "epoch": 0.6324110671936759, "grad_norm": 2.7428791522979736, "learning_rate": 2.706408805017214e-06, "loss": 4.1071, "step": 800 }, { "epoch": 0.633201581027668, "grad_norm": 2.3043484687805176, "learning_rate": 2.705663004779763e-06, "loss": 4.1071, "step": 801 }, { "epoch": 0.6339920948616601, "grad_norm": 2.02844500541687, "learning_rate": 2.7049163615118553e-06, "loss": 4.2363, "step": 802 }, { "epoch": 0.6347826086956522, "grad_norm": 2.531156063079834, "learning_rate": 2.7041688757355627e-06, "loss": 3.9487, "step": 803 }, { "epoch": 0.6355731225296443, "grad_norm": 2.187513589859009, "learning_rate": 2.7034205479735474e-06, "loss": 3.9865, "step": 804 }, { "epoch": 0.6363636363636364, "grad_norm": 2.0692331790924072, "learning_rate": 2.702671378749058e-06, "loss": 4.1053, "step": 805 }, { "epoch": 0.6371541501976284, "grad_norm": 3.608257532119751, "learning_rate": 2.701921368585934e-06, "loss": 3.6333, "step": 806 }, { "epoch": 0.6379446640316205, "grad_norm": 2.904628276824951, "learning_rate": 2.7011705180086017e-06, "loss": 3.793, "step": 807 }, { "epoch": 0.6387351778656126, "grad_norm": 2.5274531841278076, "learning_rate": 2.700418827542073e-06, "loss": 3.9712, "step": 808 }, { "epoch": 0.6395256916996047, "grad_norm": 2.8085296154022217, "learning_rate": 2.6996662977119517e-06, "loss": 4.0791, "step": 809 }, { "epoch": 0.6403162055335968, "grad_norm": 2.5384461879730225, "learning_rate": 2.698912929044425e-06, "loss": 3.5324, "step": 810 }, { "epoch": 0.6411067193675889, "grad_norm": 2.6160666942596436, "learning_rate": 2.698158722066267e-06, "loss": 3.9445, "step": 811 }, { "epoch": 0.641897233201581, "grad_norm": 2.0030596256256104, "learning_rate": 2.697403677304839e-06, "loss": 4.0427, "step": 812 }, { "epoch": 0.6426877470355731, "grad_norm": 2.345379590988159, "learning_rate": 2.6966477952880878e-06, "loss": 3.8593, "step": 813 }, { "epoch": 0.6434782608695652, "grad_norm": 2.766815185546875, "learning_rate": 2.6958910765445453e-06, "loss": 3.9392, "step": 814 }, { "epoch": 0.6442687747035574, "grad_norm": 2.4397313594818115, "learning_rate": 2.6951335216033286e-06, "loss": 3.8176, "step": 815 }, { "epoch": 0.6450592885375495, "grad_norm": 2.7999956607818604, "learning_rate": 2.6943751309941395e-06, "loss": 3.5528, "step": 816 }, { "epoch": 0.6458498023715415, "grad_norm": 2.2732229232788086, "learning_rate": 2.6936159052472645e-06, "loss": 4.0696, "step": 817 }, { "epoch": 0.6466403162055336, "grad_norm": 2.725722074508667, "learning_rate": 2.6928558448935727e-06, "loss": 3.5875, "step": 818 }, { "epoch": 0.6474308300395257, "grad_norm": 2.213855266571045, "learning_rate": 2.692094950464519e-06, "loss": 4.2364, "step": 819 }, { "epoch": 0.6482213438735178, "grad_norm": 2.1468918323516846, "learning_rate": 2.6913332224921397e-06, "loss": 3.9879, "step": 820 }, { "epoch": 0.6490118577075099, "grad_norm": 2.4947948455810547, "learning_rate": 2.690570661509054e-06, "loss": 3.7295, "step": 821 }, { "epoch": 0.649802371541502, "grad_norm": 2.4082276821136475, "learning_rate": 2.6898072680484653e-06, "loss": 3.8754, "step": 822 }, { "epoch": 0.6505928853754941, "grad_norm": 2.481489419937134, "learning_rate": 2.689043042644157e-06, "loss": 3.9458, "step": 823 }, { "epoch": 0.6513833992094862, "grad_norm": 2.0998198986053467, "learning_rate": 2.6882779858304953e-06, "loss": 4.2798, "step": 824 }, { "epoch": 0.6521739130434783, "grad_norm": 2.3810606002807617, "learning_rate": 2.6875120981424275e-06, "loss": 4.04, "step": 825 }, { "epoch": 0.6529644268774704, "grad_norm": 2.3978240489959717, "learning_rate": 2.6867453801154814e-06, "loss": 4.308, "step": 826 }, { "epoch": 0.6537549407114625, "grad_norm": 2.390634775161743, "learning_rate": 2.685977832285767e-06, "loss": 3.8228, "step": 827 }, { "epoch": 0.6545454545454545, "grad_norm": 2.2875757217407227, "learning_rate": 2.6852094551899724e-06, "loss": 4.3557, "step": 828 }, { "epoch": 0.6553359683794466, "grad_norm": 2.775069236755371, "learning_rate": 2.6844402493653667e-06, "loss": 4.2933, "step": 829 }, { "epoch": 0.6561264822134387, "grad_norm": 2.4061741828918457, "learning_rate": 2.6836702153497986e-06, "loss": 3.866, "step": 830 }, { "epoch": 0.6569169960474308, "grad_norm": 2.139143228530884, "learning_rate": 2.682899353681696e-06, "loss": 4.0703, "step": 831 }, { "epoch": 0.6577075098814229, "grad_norm": 2.1474788188934326, "learning_rate": 2.682127664900064e-06, "loss": 4.1683, "step": 832 }, { "epoch": 0.658498023715415, "grad_norm": 2.2777280807495117, "learning_rate": 2.6813551495444886e-06, "loss": 3.9372, "step": 833 }, { "epoch": 0.6592885375494071, "grad_norm": 2.524709463119507, "learning_rate": 2.6805818081551306e-06, "loss": 4.0635, "step": 834 }, { "epoch": 0.6600790513833992, "grad_norm": 2.274681329727173, "learning_rate": 2.6798076412727317e-06, "loss": 4.2907, "step": 835 }, { "epoch": 0.6608695652173913, "grad_norm": 2.082883596420288, "learning_rate": 2.6790326494386083e-06, "loss": 3.9274, "step": 836 }, { "epoch": 0.6616600790513834, "grad_norm": 2.1312081813812256, "learning_rate": 2.678256833194655e-06, "loss": 4.0829, "step": 837 }, { "epoch": 0.6624505928853754, "grad_norm": 2.7159502506256104, "learning_rate": 2.677480193083342e-06, "loss": 4.134, "step": 838 }, { "epoch": 0.6632411067193675, "grad_norm": 2.0130436420440674, "learning_rate": 2.676702729647716e-06, "loss": 4.0105, "step": 839 }, { "epoch": 0.6640316205533597, "grad_norm": 2.3664658069610596, "learning_rate": 2.6759244434314e-06, "loss": 3.9772, "step": 840 }, { "epoch": 0.6648221343873518, "grad_norm": 2.241194725036621, "learning_rate": 2.675145334978591e-06, "loss": 4.2012, "step": 841 }, { "epoch": 0.6656126482213439, "grad_norm": 2.307687759399414, "learning_rate": 2.6743654048340615e-06, "loss": 4.0693, "step": 842 }, { "epoch": 0.666403162055336, "grad_norm": 2.2648966312408447, "learning_rate": 2.6735846535431593e-06, "loss": 4.1404, "step": 843 }, { "epoch": 0.6671936758893281, "grad_norm": 2.7948384284973145, "learning_rate": 2.6728030816518047e-06, "loss": 4.2546, "step": 844 }, { "epoch": 0.6679841897233202, "grad_norm": 2.0937070846557617, "learning_rate": 2.6720206897064936e-06, "loss": 4.1157, "step": 845 }, { "epoch": 0.6687747035573123, "grad_norm": 2.207416296005249, "learning_rate": 2.6712374782542947e-06, "loss": 4.3187, "step": 846 }, { "epoch": 0.6695652173913044, "grad_norm": 2.2130327224731445, "learning_rate": 2.6704534478428485e-06, "loss": 3.993, "step": 847 }, { "epoch": 0.6703557312252965, "grad_norm": 2.1370534896850586, "learning_rate": 2.66966859902037e-06, "loss": 4.0306, "step": 848 }, { "epoch": 0.6711462450592885, "grad_norm": 2.064194917678833, "learning_rate": 2.6688829323356456e-06, "loss": 4.1692, "step": 849 }, { "epoch": 0.6719367588932806, "grad_norm": 2.6898372173309326, "learning_rate": 2.668096448338033e-06, "loss": 4.1655, "step": 850 }, { "epoch": 0.6727272727272727, "grad_norm": 2.34110689163208, "learning_rate": 2.6673091475774623e-06, "loss": 4.1885, "step": 851 }, { "epoch": 0.6735177865612648, "grad_norm": 2.2008838653564453, "learning_rate": 2.666521030604434e-06, "loss": 4.0724, "step": 852 }, { "epoch": 0.6743083003952569, "grad_norm": 2.198943614959717, "learning_rate": 2.66573209797002e-06, "loss": 3.8599, "step": 853 }, { "epoch": 0.675098814229249, "grad_norm": 2.8422276973724365, "learning_rate": 2.6649423502258624e-06, "loss": 4.0712, "step": 854 }, { "epoch": 0.6758893280632411, "grad_norm": 2.284669876098633, "learning_rate": 2.664151787924173e-06, "loss": 3.8973, "step": 855 }, { "epoch": 0.6766798418972332, "grad_norm": 1.9246364831924438, "learning_rate": 2.663360411617733e-06, "loss": 4.2161, "step": 856 }, { "epoch": 0.6774703557312253, "grad_norm": 2.057915449142456, "learning_rate": 2.6625682218598934e-06, "loss": 4.3339, "step": 857 }, { "epoch": 0.6782608695652174, "grad_norm": 2.8328659534454346, "learning_rate": 2.6617752192045725e-06, "loss": 3.8611, "step": 858 }, { "epoch": 0.6790513833992095, "grad_norm": 2.6512038707733154, "learning_rate": 2.6609814042062588e-06, "loss": 3.9386, "step": 859 }, { "epoch": 0.6798418972332015, "grad_norm": 2.456908702850342, "learning_rate": 2.6601867774200087e-06, "loss": 3.554, "step": 860 }, { "epoch": 0.6806324110671936, "grad_norm": 2.1387672424316406, "learning_rate": 2.6593913394014444e-06, "loss": 4.1387, "step": 861 }, { "epoch": 0.6814229249011857, "grad_norm": 1.9091566801071167, "learning_rate": 2.658595090706757e-06, "loss": 4.5453, "step": 862 }, { "epoch": 0.6822134387351778, "grad_norm": 2.513051986694336, "learning_rate": 2.6577980318927045e-06, "loss": 4.115, "step": 863 }, { "epoch": 0.68300395256917, "grad_norm": 2.218872308731079, "learning_rate": 2.6570001635166105e-06, "loss": 4.0824, "step": 864 }, { "epoch": 0.6837944664031621, "grad_norm": 2.3143410682678223, "learning_rate": 2.6562014861363644e-06, "loss": 3.8951, "step": 865 }, { "epoch": 0.6845849802371542, "grad_norm": 3.719550609588623, "learning_rate": 2.655402000310423e-06, "loss": 3.8061, "step": 866 }, { "epoch": 0.6853754940711463, "grad_norm": 2.295003652572632, "learning_rate": 2.6546017065978066e-06, "loss": 4.074, "step": 867 }, { "epoch": 0.6861660079051384, "grad_norm": 2.439589023590088, "learning_rate": 2.6538006055581014e-06, "loss": 4.1008, "step": 868 }, { "epoch": 0.6869565217391305, "grad_norm": 2.2425475120544434, "learning_rate": 2.652998697751457e-06, "loss": 4.2183, "step": 869 }, { "epoch": 0.6877470355731226, "grad_norm": 2.263718366622925, "learning_rate": 2.6521959837385883e-06, "loss": 4.1758, "step": 870 }, { "epoch": 0.6885375494071146, "grad_norm": 2.861074209213257, "learning_rate": 2.651392464080774e-06, "loss": 4.0069, "step": 871 }, { "epoch": 0.6893280632411067, "grad_norm": 2.618819236755371, "learning_rate": 2.650588139339855e-06, "loss": 3.5076, "step": 872 }, { "epoch": 0.6901185770750988, "grad_norm": 2.4157986640930176, "learning_rate": 2.649783010078236e-06, "loss": 4.2161, "step": 873 }, { "epoch": 0.6909090909090909, "grad_norm": 2.8157382011413574, "learning_rate": 2.648977076858884e-06, "loss": 4.059, "step": 874 }, { "epoch": 0.691699604743083, "grad_norm": 2.1759843826293945, "learning_rate": 2.648170340245328e-06, "loss": 4.161, "step": 875 }, { "epoch": 0.6924901185770751, "grad_norm": 2.2429354190826416, "learning_rate": 2.647362800801659e-06, "loss": 3.8952, "step": 876 }, { "epoch": 0.6932806324110672, "grad_norm": 2.1989526748657227, "learning_rate": 2.6465544590925297e-06, "loss": 4.1447, "step": 877 }, { "epoch": 0.6940711462450593, "grad_norm": 2.5280017852783203, "learning_rate": 2.645745315683153e-06, "loss": 4.0264, "step": 878 }, { "epoch": 0.6948616600790514, "grad_norm": 2.256199359893799, "learning_rate": 2.6449353711393026e-06, "loss": 3.8894, "step": 879 }, { "epoch": 0.6956521739130435, "grad_norm": 2.3584611415863037, "learning_rate": 2.644124626027312e-06, "loss": 3.8634, "step": 880 }, { "epoch": 0.6964426877470355, "grad_norm": 3.7781054973602295, "learning_rate": 2.643313080914076e-06, "loss": 3.6015, "step": 881 }, { "epoch": 0.6972332015810276, "grad_norm": 2.6256213188171387, "learning_rate": 2.6425007363670467e-06, "loss": 4.3039, "step": 882 }, { "epoch": 0.6980237154150197, "grad_norm": 2.3650383949279785, "learning_rate": 2.6416875929542376e-06, "loss": 3.9834, "step": 883 }, { "epoch": 0.6988142292490118, "grad_norm": 2.488135576248169, "learning_rate": 2.6408736512442176e-06, "loss": 4.0359, "step": 884 }, { "epoch": 0.6996047430830039, "grad_norm": 2.437453031539917, "learning_rate": 2.640058911806117e-06, "loss": 3.901, "step": 885 }, { "epoch": 0.700395256916996, "grad_norm": 2.2627832889556885, "learning_rate": 2.639243375209622e-06, "loss": 4.0306, "step": 886 }, { "epoch": 0.7011857707509881, "grad_norm": 2.3905584812164307, "learning_rate": 2.638427042024976e-06, "loss": 4.0682, "step": 887 }, { "epoch": 0.7019762845849803, "grad_norm": 3.5144238471984863, "learning_rate": 2.6376099128229818e-06, "loss": 3.9211, "step": 888 }, { "epoch": 0.7027667984189724, "grad_norm": 2.219741106033325, "learning_rate": 2.6367919881749954e-06, "loss": 3.8753, "step": 889 }, { "epoch": 0.7035573122529645, "grad_norm": 2.295553207397461, "learning_rate": 2.635973268652931e-06, "loss": 3.8117, "step": 890 }, { "epoch": 0.7043478260869566, "grad_norm": 2.0033695697784424, "learning_rate": 2.6351537548292587e-06, "loss": 4.1418, "step": 891 }, { "epoch": 0.7051383399209487, "grad_norm": 2.2431528568267822, "learning_rate": 2.6343334472770037e-06, "loss": 3.9587, "step": 892 }, { "epoch": 0.7059288537549407, "grad_norm": 2.1806976795196533, "learning_rate": 2.6335123465697457e-06, "loss": 4.1994, "step": 893 }, { "epoch": 0.7067193675889328, "grad_norm": 2.108940839767456, "learning_rate": 2.632690453281619e-06, "loss": 4.0614, "step": 894 }, { "epoch": 0.7075098814229249, "grad_norm": 2.1497490406036377, "learning_rate": 2.6318677679873137e-06, "loss": 4.3631, "step": 895 }, { "epoch": 0.708300395256917, "grad_norm": 2.2527408599853516, "learning_rate": 2.631044291262071e-06, "loss": 4.1585, "step": 896 }, { "epoch": 0.7090909090909091, "grad_norm": 2.0461881160736084, "learning_rate": 2.6302200236816876e-06, "loss": 4.2429, "step": 897 }, { "epoch": 0.7098814229249012, "grad_norm": 2.2599215507507324, "learning_rate": 2.6293949658225122e-06, "loss": 3.8385, "step": 898 }, { "epoch": 0.7106719367588933, "grad_norm": 2.3883345127105713, "learning_rate": 2.6285691182614466e-06, "loss": 4.1886, "step": 899 }, { "epoch": 0.7114624505928854, "grad_norm": 2.0081310272216797, "learning_rate": 2.627742481575945e-06, "loss": 4.1318, "step": 900 }, { "epoch": 0.7122529644268775, "grad_norm": 2.3132641315460205, "learning_rate": 2.626915056344012e-06, "loss": 4.1151, "step": 901 }, { "epoch": 0.7130434782608696, "grad_norm": 2.678299903869629, "learning_rate": 2.626086843144205e-06, "loss": 4.0515, "step": 902 }, { "epoch": 0.7138339920948616, "grad_norm": 2.3359146118164062, "learning_rate": 2.625257842555632e-06, "loss": 4.0174, "step": 903 }, { "epoch": 0.7146245059288537, "grad_norm": 2.2332403659820557, "learning_rate": 2.6244280551579518e-06, "loss": 3.829, "step": 904 }, { "epoch": 0.7154150197628458, "grad_norm": 2.352632522583008, "learning_rate": 2.623597481531372e-06, "loss": 4.2305, "step": 905 }, { "epoch": 0.7162055335968379, "grad_norm": 2.4200477600097656, "learning_rate": 2.6227661222566517e-06, "loss": 3.7384, "step": 906 }, { "epoch": 0.71699604743083, "grad_norm": 2.151242971420288, "learning_rate": 2.6219339779150986e-06, "loss": 3.9993, "step": 907 }, { "epoch": 0.7177865612648221, "grad_norm": 2.6561155319213867, "learning_rate": 2.621101049088569e-06, "loss": 3.635, "step": 908 }, { "epoch": 0.7185770750988142, "grad_norm": 2.277362585067749, "learning_rate": 2.6202673363594683e-06, "loss": 3.8459, "step": 909 }, { "epoch": 0.7193675889328063, "grad_norm": 2.076112985610962, "learning_rate": 2.6194328403107497e-06, "loss": 4.2828, "step": 910 }, { "epoch": 0.7201581027667984, "grad_norm": 2.6213555335998535, "learning_rate": 2.6185975615259136e-06, "loss": 3.8153, "step": 911 }, { "epoch": 0.7209486166007905, "grad_norm": 2.322824478149414, "learning_rate": 2.6177615005890096e-06, "loss": 4.3443, "step": 912 }, { "epoch": 0.7217391304347827, "grad_norm": 2.5151655673980713, "learning_rate": 2.6169246580846323e-06, "loss": 3.8787, "step": 913 }, { "epoch": 0.7225296442687748, "grad_norm": 2.27738618850708, "learning_rate": 2.616087034597923e-06, "loss": 3.7166, "step": 914 }, { "epoch": 0.7233201581027668, "grad_norm": 2.36220645904541, "learning_rate": 2.6152486307145705e-06, "loss": 3.9584, "step": 915 }, { "epoch": 0.7241106719367589, "grad_norm": 2.3943774700164795, "learning_rate": 2.614409447020807e-06, "loss": 3.6486, "step": 916 }, { "epoch": 0.724901185770751, "grad_norm": 3.82273006439209, "learning_rate": 2.6135694841034122e-06, "loss": 3.9474, "step": 917 }, { "epoch": 0.7256916996047431, "grad_norm": 2.472172260284424, "learning_rate": 2.61272874254971e-06, "loss": 3.4366, "step": 918 }, { "epoch": 0.7264822134387352, "grad_norm": 2.204956293106079, "learning_rate": 2.611887222947567e-06, "loss": 3.8958, "step": 919 }, { "epoch": 0.7272727272727273, "grad_norm": 2.1740472316741943, "learning_rate": 2.6110449258853962e-06, "loss": 3.995, "step": 920 }, { "epoch": 0.7280632411067194, "grad_norm": 2.3041067123413086, "learning_rate": 2.610201851952153e-06, "loss": 4.1623, "step": 921 }, { "epoch": 0.7288537549407115, "grad_norm": 2.3100945949554443, "learning_rate": 2.609358001737337e-06, "loss": 4.3393, "step": 922 }, { "epoch": 0.7296442687747036, "grad_norm": 2.1752612590789795, "learning_rate": 2.608513375830989e-06, "loss": 4.156, "step": 923 }, { "epoch": 0.7304347826086957, "grad_norm": 2.1057252883911133, "learning_rate": 2.607667974823693e-06, "loss": 3.8507, "step": 924 }, { "epoch": 0.7312252964426877, "grad_norm": 2.676820755004883, "learning_rate": 2.606821799306576e-06, "loss": 3.7325, "step": 925 }, { "epoch": 0.7320158102766798, "grad_norm": 2.391728162765503, "learning_rate": 2.605974849871305e-06, "loss": 3.6163, "step": 926 }, { "epoch": 0.7328063241106719, "grad_norm": 2.1381988525390625, "learning_rate": 2.6051271271100885e-06, "loss": 3.9786, "step": 927 }, { "epoch": 0.733596837944664, "grad_norm": 2.436816692352295, "learning_rate": 2.6042786316156764e-06, "loss": 3.9796, "step": 928 }, { "epoch": 0.7343873517786561, "grad_norm": 2.2237181663513184, "learning_rate": 2.6034293639813586e-06, "loss": 4.3072, "step": 929 }, { "epoch": 0.7351778656126482, "grad_norm": 2.2231452465057373, "learning_rate": 2.6025793248009643e-06, "loss": 4.2774, "step": 930 }, { "epoch": 0.7359683794466403, "grad_norm": 2.3364923000335693, "learning_rate": 2.601728514668863e-06, "loss": 4.039, "step": 931 }, { "epoch": 0.7367588932806324, "grad_norm": 2.2584805488586426, "learning_rate": 2.6008769341799627e-06, "loss": 3.7703, "step": 932 }, { "epoch": 0.7375494071146245, "grad_norm": 2.3611459732055664, "learning_rate": 2.60002458392971e-06, "loss": 3.7913, "step": 933 }, { "epoch": 0.7383399209486166, "grad_norm": 2.279327630996704, "learning_rate": 2.599171464514091e-06, "loss": 4.0986, "step": 934 }, { "epoch": 0.7391304347826086, "grad_norm": 2.6149280071258545, "learning_rate": 2.598317576529628e-06, "loss": 4.2062, "step": 935 }, { "epoch": 0.7399209486166007, "grad_norm": 2.349127769470215, "learning_rate": 2.5974629205733815e-06, "loss": 4.1281, "step": 936 }, { "epoch": 0.7407114624505929, "grad_norm": 2.396867275238037, "learning_rate": 2.5966074972429488e-06, "loss": 3.897, "step": 937 }, { "epoch": 0.741501976284585, "grad_norm": 2.6701269149780273, "learning_rate": 2.5957513071364632e-06, "loss": 3.7002, "step": 938 }, { "epoch": 0.7422924901185771, "grad_norm": 2.1795315742492676, "learning_rate": 2.5948943508525962e-06, "loss": 4.2792, "step": 939 }, { "epoch": 0.7430830039525692, "grad_norm": 2.3323276042938232, "learning_rate": 2.594036628990552e-06, "loss": 4.1539, "step": 940 }, { "epoch": 0.7438735177865613, "grad_norm": 2.6732797622680664, "learning_rate": 2.593178142150073e-06, "loss": 3.9515, "step": 941 }, { "epoch": 0.7446640316205534, "grad_norm": 2.0245494842529297, "learning_rate": 2.5923188909314346e-06, "loss": 4.2551, "step": 942 }, { "epoch": 0.7454545454545455, "grad_norm": 1.9830007553100586, "learning_rate": 2.5914588759354468e-06, "loss": 4.3873, "step": 943 }, { "epoch": 0.7462450592885376, "grad_norm": 2.4068686962127686, "learning_rate": 2.590598097763455e-06, "loss": 4.1314, "step": 944 }, { "epoch": 0.7470355731225297, "grad_norm": 2.229933261871338, "learning_rate": 2.589736557017337e-06, "loss": 3.8728, "step": 945 }, { "epoch": 0.7478260869565218, "grad_norm": 2.451969623565674, "learning_rate": 2.588874254299504e-06, "loss": 3.9637, "step": 946 }, { "epoch": 0.7486166007905138, "grad_norm": 2.316436529159546, "learning_rate": 2.5880111902129e-06, "loss": 4.0349, "step": 947 }, { "epoch": 0.7494071146245059, "grad_norm": 2.3852484226226807, "learning_rate": 2.5871473653610016e-06, "loss": 3.8627, "step": 948 }, { "epoch": 0.750197628458498, "grad_norm": 2.3170361518859863, "learning_rate": 2.5862827803478183e-06, "loss": 3.8868, "step": 949 }, { "epoch": 0.7509881422924901, "grad_norm": 2.05137300491333, "learning_rate": 2.585417435777888e-06, "loss": 4.2488, "step": 950 }, { "epoch": 0.7517786561264822, "grad_norm": 2.0052125453948975, "learning_rate": 2.5845513322562836e-06, "loss": 4.244, "step": 951 }, { "epoch": 0.7525691699604743, "grad_norm": 2.2393226623535156, "learning_rate": 2.5836844703886066e-06, "loss": 4.0544, "step": 952 }, { "epoch": 0.7533596837944664, "grad_norm": 2.656914710998535, "learning_rate": 2.5828168507809885e-06, "loss": 4.0542, "step": 953 }, { "epoch": 0.7541501976284585, "grad_norm": 2.188429355621338, "learning_rate": 2.5819484740400914e-06, "loss": 4.0704, "step": 954 }, { "epoch": 0.7549407114624506, "grad_norm": 2.312324285507202, "learning_rate": 2.581079340773106e-06, "loss": 4.1455, "step": 955 }, { "epoch": 0.7557312252964427, "grad_norm": 3.7319936752319336, "learning_rate": 2.5802094515877536e-06, "loss": 4.0647, "step": 956 }, { "epoch": 0.7565217391304347, "grad_norm": 2.043283224105835, "learning_rate": 2.579338807092282e-06, "loss": 4.1997, "step": 957 }, { "epoch": 0.7573122529644268, "grad_norm": 2.1624205112457275, "learning_rate": 2.5784674078954693e-06, "loss": 4.1069, "step": 958 }, { "epoch": 0.7581027667984189, "grad_norm": 1.975084900856018, "learning_rate": 2.577595254606619e-06, "loss": 4.2481, "step": 959 }, { "epoch": 0.758893280632411, "grad_norm": 2.5662841796875, "learning_rate": 2.5767223478355634e-06, "loss": 3.6306, "step": 960 }, { "epoch": 0.7596837944664032, "grad_norm": 2.234062433242798, "learning_rate": 2.575848688192661e-06, "loss": 4.1144, "step": 961 }, { "epoch": 0.7604743083003953, "grad_norm": 2.219942569732666, "learning_rate": 2.5749742762887978e-06, "loss": 4.387, "step": 962 }, { "epoch": 0.7612648221343874, "grad_norm": 2.189321517944336, "learning_rate": 2.5740991127353845e-06, "loss": 4.4305, "step": 963 }, { "epoch": 0.7620553359683795, "grad_norm": 2.498145580291748, "learning_rate": 2.573223198144358e-06, "loss": 3.5523, "step": 964 }, { "epoch": 0.7628458498023716, "grad_norm": 3.0862233638763428, "learning_rate": 2.57234653312818e-06, "loss": 4.0363, "step": 965 }, { "epoch": 0.7636363636363637, "grad_norm": 2.422774314880371, "learning_rate": 2.571469118299837e-06, "loss": 3.9101, "step": 966 }, { "epoch": 0.7644268774703558, "grad_norm": 2.4641342163085938, "learning_rate": 2.5705909542728408e-06, "loss": 3.8479, "step": 967 }, { "epoch": 0.7652173913043478, "grad_norm": 2.5311570167541504, "learning_rate": 2.569712041661225e-06, "loss": 3.9859, "step": 968 }, { "epoch": 0.7660079051383399, "grad_norm": 2.3989365100860596, "learning_rate": 2.568832381079548e-06, "loss": 4.1524, "step": 969 }, { "epoch": 0.766798418972332, "grad_norm": 2.174476146697998, "learning_rate": 2.5679519731428923e-06, "loss": 3.8997, "step": 970 }, { "epoch": 0.7675889328063241, "grad_norm": 2.461684465408325, "learning_rate": 2.5670708184668606e-06, "loss": 4.1276, "step": 971 }, { "epoch": 0.7683794466403162, "grad_norm": 2.5029027462005615, "learning_rate": 2.5661889176675784e-06, "loss": 3.8472, "step": 972 }, { "epoch": 0.7691699604743083, "grad_norm": 2.197340965270996, "learning_rate": 2.5653062713616947e-06, "loss": 3.9801, "step": 973 }, { "epoch": 0.7699604743083004, "grad_norm": 2.3003313541412354, "learning_rate": 2.564422880166377e-06, "loss": 4.0012, "step": 974 }, { "epoch": 0.7707509881422925, "grad_norm": 2.419194221496582, "learning_rate": 2.5635387446993155e-06, "loss": 3.8685, "step": 975 }, { "epoch": 0.7715415019762846, "grad_norm": 3.388212203979492, "learning_rate": 2.562653865578721e-06, "loss": 3.636, "step": 976 }, { "epoch": 0.7723320158102767, "grad_norm": 2.2525463104248047, "learning_rate": 2.5617682434233232e-06, "loss": 3.8776, "step": 977 }, { "epoch": 0.7731225296442688, "grad_norm": 2.0554442405700684, "learning_rate": 2.560881878852372e-06, "loss": 3.9552, "step": 978 }, { "epoch": 0.7739130434782608, "grad_norm": 2.7389047145843506, "learning_rate": 2.5599947724856355e-06, "loss": 4.0081, "step": 979 }, { "epoch": 0.7747035573122529, "grad_norm": 2.169696807861328, "learning_rate": 2.559106924943402e-06, "loss": 3.9513, "step": 980 }, { "epoch": 0.775494071146245, "grad_norm": 2.6913578510284424, "learning_rate": 2.558218336846477e-06, "loss": 4.2538, "step": 981 }, { "epoch": 0.7762845849802371, "grad_norm": 2.20332670211792, "learning_rate": 2.5573290088161834e-06, "loss": 4.1513, "step": 982 }, { "epoch": 0.7770750988142292, "grad_norm": 2.0874626636505127, "learning_rate": 2.556438941474363e-06, "loss": 4.008, "step": 983 }, { "epoch": 0.7778656126482213, "grad_norm": 2.22428822517395, "learning_rate": 2.5555481354433736e-06, "loss": 4.2824, "step": 984 }, { "epoch": 0.7786561264822134, "grad_norm": 2.162810802459717, "learning_rate": 2.5546565913460897e-06, "loss": 4.0752, "step": 985 }, { "epoch": 0.7794466403162056, "grad_norm": 2.6237218379974365, "learning_rate": 2.553764309805901e-06, "loss": 3.8888, "step": 986 }, { "epoch": 0.7802371541501977, "grad_norm": 2.358827590942383, "learning_rate": 2.5528712914467145e-06, "loss": 3.7527, "step": 987 }, { "epoch": 0.7810276679841898, "grad_norm": 3.074803113937378, "learning_rate": 2.551977536892951e-06, "loss": 4.2503, "step": 988 }, { "epoch": 0.7818181818181819, "grad_norm": 2.729013681411743, "learning_rate": 2.5510830467695465e-06, "loss": 3.593, "step": 989 }, { "epoch": 0.782608695652174, "grad_norm": 2.1288058757781982, "learning_rate": 2.550187821701952e-06, "loss": 4.054, "step": 990 }, { "epoch": 0.783399209486166, "grad_norm": 2.720097780227661, "learning_rate": 2.5492918623161307e-06, "loss": 3.8738, "step": 991 }, { "epoch": 0.7841897233201581, "grad_norm": 2.09617280960083, "learning_rate": 2.5483951692385615e-06, "loss": 4.2999, "step": 992 }, { "epoch": 0.7849802371541502, "grad_norm": 2.3726985454559326, "learning_rate": 2.547497743096235e-06, "loss": 4.1372, "step": 993 }, { "epoch": 0.7857707509881423, "grad_norm": 2.346933364868164, "learning_rate": 2.5465995845166533e-06, "loss": 4.1788, "step": 994 }, { "epoch": 0.7865612648221344, "grad_norm": 2.3351120948791504, "learning_rate": 2.5457006941278336e-06, "loss": 3.9011, "step": 995 }, { "epoch": 0.7873517786561265, "grad_norm": 2.273986339569092, "learning_rate": 2.544801072558302e-06, "loss": 3.8561, "step": 996 }, { "epoch": 0.7881422924901186, "grad_norm": 2.4926962852478027, "learning_rate": 2.5439007204370978e-06, "loss": 4.034, "step": 997 }, { "epoch": 0.7889328063241107, "grad_norm": 2.6123082637786865, "learning_rate": 2.5429996383937693e-06, "loss": 4.1832, "step": 998 }, { "epoch": 0.7897233201581028, "grad_norm": 2.231175422668457, "learning_rate": 2.5420978270583767e-06, "loss": 4.2723, "step": 999 }, { "epoch": 0.7905138339920948, "grad_norm": 2.0974769592285156, "learning_rate": 2.5411952870614894e-06, "loss": 4.0911, "step": 1000 }, { "epoch": 0.7905138339920948, "eval_loss": 3.9191253185272217, "eval_runtime": 5.9485, "eval_samples_per_second": 504.333, "eval_steps_per_second": 2.185, "step": 1000 }, { "epoch": 0.7913043478260869, "grad_norm": 2.3267786502838135, "learning_rate": 2.5402920190341867e-06, "loss": 3.7713, "step": 1001 }, { "epoch": 0.792094861660079, "grad_norm": 2.4680113792419434, "learning_rate": 2.539388023608056e-06, "loss": 4.3457, "step": 1002 }, { "epoch": 0.7928853754940711, "grad_norm": 2.045832633972168, "learning_rate": 2.538483301415196e-06, "loss": 4.1167, "step": 1003 }, { "epoch": 0.7936758893280632, "grad_norm": 2.0550899505615234, "learning_rate": 2.5375778530882093e-06, "loss": 4.2526, "step": 1004 }, { "epoch": 0.7944664031620553, "grad_norm": 2.1613359451293945, "learning_rate": 2.5366716792602094e-06, "loss": 3.9528, "step": 1005 }, { "epoch": 0.7952569169960474, "grad_norm": 2.3303403854370117, "learning_rate": 2.535764780564817e-06, "loss": 3.9862, "step": 1006 }, { "epoch": 0.7960474308300395, "grad_norm": 2.3874752521514893, "learning_rate": 2.534857157636158e-06, "loss": 3.9893, "step": 1007 }, { "epoch": 0.7968379446640316, "grad_norm": 2.179624319076538, "learning_rate": 2.5339488111088663e-06, "loss": 4.0841, "step": 1008 }, { "epoch": 0.7976284584980237, "grad_norm": 2.2154030799865723, "learning_rate": 2.53303974161808e-06, "loss": 3.977, "step": 1009 }, { "epoch": 0.7984189723320159, "grad_norm": 2.4224047660827637, "learning_rate": 2.532129949799445e-06, "loss": 4.1625, "step": 1010 }, { "epoch": 0.799209486166008, "grad_norm": 2.2478575706481934, "learning_rate": 2.5312194362891108e-06, "loss": 4.272, "step": 1011 }, { "epoch": 0.8, "grad_norm": 2.3082523345947266, "learning_rate": 2.5303082017237305e-06, "loss": 3.8187, "step": 1012 }, { "epoch": 0.8007905138339921, "grad_norm": 2.336362361907959, "learning_rate": 2.529396246740464e-06, "loss": 3.9378, "step": 1013 }, { "epoch": 0.8015810276679842, "grad_norm": 2.4469549655914307, "learning_rate": 2.5284835719769735e-06, "loss": 3.8783, "step": 1014 }, { "epoch": 0.8023715415019763, "grad_norm": 2.4515938758850098, "learning_rate": 2.5275701780714237e-06, "loss": 4.0861, "step": 1015 }, { "epoch": 0.8031620553359684, "grad_norm": 2.7598514556884766, "learning_rate": 2.526656065662484e-06, "loss": 4.171, "step": 1016 }, { "epoch": 0.8039525691699605, "grad_norm": 2.239118814468384, "learning_rate": 2.5257412353893247e-06, "loss": 4.0948, "step": 1017 }, { "epoch": 0.8047430830039526, "grad_norm": 2.32548451423645, "learning_rate": 2.5248256878916187e-06, "loss": 4.2434, "step": 1018 }, { "epoch": 0.8055335968379447, "grad_norm": 2.357783079147339, "learning_rate": 2.5239094238095404e-06, "loss": 3.4441, "step": 1019 }, { "epoch": 0.8063241106719368, "grad_norm": 2.058227300643921, "learning_rate": 2.5229924437837647e-06, "loss": 3.9499, "step": 1020 }, { "epoch": 0.8071146245059289, "grad_norm": 2.669065475463867, "learning_rate": 2.5220747484554672e-06, "loss": 4.0444, "step": 1021 }, { "epoch": 0.807905138339921, "grad_norm": 2.1738414764404297, "learning_rate": 2.521156338466326e-06, "loss": 4.114, "step": 1022 }, { "epoch": 0.808695652173913, "grad_norm": 2.218989372253418, "learning_rate": 2.5202372144585143e-06, "loss": 3.9668, "step": 1023 }, { "epoch": 0.8094861660079051, "grad_norm": 2.310152530670166, "learning_rate": 2.5193173770747087e-06, "loss": 3.9796, "step": 1024 }, { "epoch": 0.8102766798418972, "grad_norm": 2.244147300720215, "learning_rate": 2.518396826958082e-06, "loss": 4.1388, "step": 1025 }, { "epoch": 0.8110671936758893, "grad_norm": 2.7011163234710693, "learning_rate": 2.517475564752308e-06, "loss": 4.207, "step": 1026 }, { "epoch": 0.8118577075098814, "grad_norm": 2.0463356971740723, "learning_rate": 2.516553591101555e-06, "loss": 4.337, "step": 1027 }, { "epoch": 0.8126482213438735, "grad_norm": 2.2174227237701416, "learning_rate": 2.515630906650492e-06, "loss": 3.7249, "step": 1028 }, { "epoch": 0.8134387351778656, "grad_norm": 2.423945188522339, "learning_rate": 2.5147075120442833e-06, "loss": 4.0487, "step": 1029 }, { "epoch": 0.8142292490118577, "grad_norm": 1.9724520444869995, "learning_rate": 2.5137834079285895e-06, "loss": 4.4374, "step": 1030 }, { "epoch": 0.8150197628458498, "grad_norm": 1.998605728149414, "learning_rate": 2.512858594949568e-06, "loss": 4.2296, "step": 1031 }, { "epoch": 0.8158102766798419, "grad_norm": 2.195652723312378, "learning_rate": 2.5119330737538725e-06, "loss": 4.2062, "step": 1032 }, { "epoch": 0.8166007905138339, "grad_norm": 2.102269172668457, "learning_rate": 2.5110068449886505e-06, "loss": 4.051, "step": 1033 }, { "epoch": 0.8173913043478261, "grad_norm": 2.215679883956909, "learning_rate": 2.5100799093015443e-06, "loss": 4.1764, "step": 1034 }, { "epoch": 0.8181818181818182, "grad_norm": 2.3267292976379395, "learning_rate": 2.5091522673406925e-06, "loss": 3.9706, "step": 1035 }, { "epoch": 0.8189723320158103, "grad_norm": 2.07296085357666, "learning_rate": 2.508223919754725e-06, "loss": 4.0473, "step": 1036 }, { "epoch": 0.8197628458498024, "grad_norm": 2.1522042751312256, "learning_rate": 2.5072948671927663e-06, "loss": 4.0118, "step": 1037 }, { "epoch": 0.8205533596837945, "grad_norm": 2.250399589538574, "learning_rate": 2.5063651103044343e-06, "loss": 4.1632, "step": 1038 }, { "epoch": 0.8213438735177866, "grad_norm": 2.261638641357422, "learning_rate": 2.5054346497398392e-06, "loss": 4.1931, "step": 1039 }, { "epoch": 0.8221343873517787, "grad_norm": 2.6754367351531982, "learning_rate": 2.504503486149582e-06, "loss": 3.7823, "step": 1040 }, { "epoch": 0.8229249011857708, "grad_norm": 2.1684045791625977, "learning_rate": 2.5035716201847553e-06, "loss": 4.4447, "step": 1041 }, { "epoch": 0.8237154150197629, "grad_norm": 2.3168790340423584, "learning_rate": 2.5026390524969455e-06, "loss": 4.0091, "step": 1042 }, { "epoch": 0.824505928853755, "grad_norm": 2.4007792472839355, "learning_rate": 2.5017057837382265e-06, "loss": 3.5009, "step": 1043 }, { "epoch": 0.825296442687747, "grad_norm": 2.419252634048462, "learning_rate": 2.5007718145611647e-06, "loss": 3.976, "step": 1044 }, { "epoch": 0.8260869565217391, "grad_norm": 2.2624294757843018, "learning_rate": 2.499837145618815e-06, "loss": 3.919, "step": 1045 }, { "epoch": 0.8268774703557312, "grad_norm": 2.1379823684692383, "learning_rate": 2.4989017775647214e-06, "loss": 4.0008, "step": 1046 }, { "epoch": 0.8276679841897233, "grad_norm": 2.0735857486724854, "learning_rate": 2.497965711052917e-06, "loss": 4.1076, "step": 1047 }, { "epoch": 0.8284584980237154, "grad_norm": 2.2392518520355225, "learning_rate": 2.4970289467379247e-06, "loss": 4.1115, "step": 1048 }, { "epoch": 0.8292490118577075, "grad_norm": 2.1649582386016846, "learning_rate": 2.496091485274753e-06, "loss": 4.0648, "step": 1049 }, { "epoch": 0.8300395256916996, "grad_norm": 2.2320902347564697, "learning_rate": 2.4951533273188995e-06, "loss": 4.1212, "step": 1050 }, { "epoch": 0.8308300395256917, "grad_norm": 2.126096725463867, "learning_rate": 2.494214473526348e-06, "loss": 4.1739, "step": 1051 }, { "epoch": 0.8316205533596838, "grad_norm": 2.8018832206726074, "learning_rate": 2.4932749245535695e-06, "loss": 3.8412, "step": 1052 }, { "epoch": 0.8324110671936759, "grad_norm": 2.6147327423095703, "learning_rate": 2.4923346810575196e-06, "loss": 4.0732, "step": 1053 }, { "epoch": 0.833201581027668, "grad_norm": 2.6588046550750732, "learning_rate": 2.491393743695642e-06, "loss": 4.2475, "step": 1054 }, { "epoch": 0.83399209486166, "grad_norm": 2.1526272296905518, "learning_rate": 2.4904521131258644e-06, "loss": 4.2125, "step": 1055 }, { "epoch": 0.8347826086956521, "grad_norm": 2.370558738708496, "learning_rate": 2.489509790006597e-06, "loss": 4.2047, "step": 1056 }, { "epoch": 0.8355731225296442, "grad_norm": 2.123455286026001, "learning_rate": 2.4885667749967386e-06, "loss": 4.1653, "step": 1057 }, { "epoch": 0.8363636363636363, "grad_norm": 2.368293523788452, "learning_rate": 2.4876230687556677e-06, "loss": 4.0319, "step": 1058 }, { "epoch": 0.8371541501976285, "grad_norm": 2.2858729362487793, "learning_rate": 2.4866786719432488e-06, "loss": 4.1394, "step": 1059 }, { "epoch": 0.8379446640316206, "grad_norm": 2.143429756164551, "learning_rate": 2.4857335852198283e-06, "loss": 4.119, "step": 1060 }, { "epoch": 0.8387351778656127, "grad_norm": 2.282593011856079, "learning_rate": 2.484787809246235e-06, "loss": 4.0537, "step": 1061 }, { "epoch": 0.8395256916996048, "grad_norm": 2.0674726963043213, "learning_rate": 2.483841344683779e-06, "loss": 4.1941, "step": 1062 }, { "epoch": 0.8403162055335969, "grad_norm": 2.234412908554077, "learning_rate": 2.4828941921942535e-06, "loss": 4.1538, "step": 1063 }, { "epoch": 0.841106719367589, "grad_norm": 2.749152898788452, "learning_rate": 2.4819463524399315e-06, "loss": 4.2427, "step": 1064 }, { "epoch": 0.841897233201581, "grad_norm": 2.2317662239074707, "learning_rate": 2.4809978260835666e-06, "loss": 4.1589, "step": 1065 }, { "epoch": 0.8426877470355731, "grad_norm": 2.3791751861572266, "learning_rate": 2.480048613788393e-06, "loss": 4.0668, "step": 1066 }, { "epoch": 0.8434782608695652, "grad_norm": 2.2860095500946045, "learning_rate": 2.479098716218124e-06, "loss": 4.1003, "step": 1067 }, { "epoch": 0.8442687747035573, "grad_norm": 3.244494915008545, "learning_rate": 2.4781481340369526e-06, "loss": 4.2506, "step": 1068 }, { "epoch": 0.8450592885375494, "grad_norm": 2.2821993827819824, "learning_rate": 2.47719686790955e-06, "loss": 3.8511, "step": 1069 }, { "epoch": 0.8458498023715415, "grad_norm": 2.3708531856536865, "learning_rate": 2.476244918501065e-06, "loss": 4.219, "step": 1070 }, { "epoch": 0.8466403162055336, "grad_norm": 2.0365285873413086, "learning_rate": 2.4752922864771273e-06, "loss": 4.2184, "step": 1071 }, { "epoch": 0.8474308300395257, "grad_norm": 2.0950231552124023, "learning_rate": 2.4743389725038394e-06, "loss": 4.2798, "step": 1072 }, { "epoch": 0.8482213438735178, "grad_norm": 2.1740732192993164, "learning_rate": 2.4733849772477838e-06, "loss": 4.0391, "step": 1073 }, { "epoch": 0.8490118577075099, "grad_norm": 2.413074016571045, "learning_rate": 2.4724303013760186e-06, "loss": 3.6222, "step": 1074 }, { "epoch": 0.849802371541502, "grad_norm": 2.363832950592041, "learning_rate": 2.4714749455560767e-06, "loss": 3.9486, "step": 1075 }, { "epoch": 0.850592885375494, "grad_norm": 2.435715913772583, "learning_rate": 2.470518910455968e-06, "loss": 3.8709, "step": 1076 }, { "epoch": 0.8513833992094861, "grad_norm": 2.304642915725708, "learning_rate": 2.4695621967441774e-06, "loss": 3.9442, "step": 1077 }, { "epoch": 0.8521739130434782, "grad_norm": 1.9921302795410156, "learning_rate": 2.4686048050896626e-06, "loss": 4.2773, "step": 1078 }, { "epoch": 0.8529644268774703, "grad_norm": 2.411316156387329, "learning_rate": 2.4676467361618566e-06, "loss": 4.0203, "step": 1079 }, { "epoch": 0.8537549407114624, "grad_norm": 2.092700242996216, "learning_rate": 2.466687990630666e-06, "loss": 4.1488, "step": 1080 }, { "epoch": 0.8545454545454545, "grad_norm": 2.287233829498291, "learning_rate": 2.46572856916647e-06, "loss": 4.0783, "step": 1081 }, { "epoch": 0.8553359683794466, "grad_norm": 2.259087324142456, "learning_rate": 2.464768472440121e-06, "loss": 3.5701, "step": 1082 }, { "epoch": 0.8561264822134388, "grad_norm": 2.1694798469543457, "learning_rate": 2.4638077011229425e-06, "loss": 3.9756, "step": 1083 }, { "epoch": 0.8569169960474309, "grad_norm": 2.478987693786621, "learning_rate": 2.462846255886731e-06, "loss": 3.6092, "step": 1084 }, { "epoch": 0.857707509881423, "grad_norm": 5.114851951599121, "learning_rate": 2.461884137403754e-06, "loss": 4.0698, "step": 1085 }, { "epoch": 0.8584980237154151, "grad_norm": 2.5167739391326904, "learning_rate": 2.4609213463467483e-06, "loss": 3.8358, "step": 1086 }, { "epoch": 0.8592885375494071, "grad_norm": 2.116410493850708, "learning_rate": 2.4599578833889228e-06, "loss": 3.9876, "step": 1087 }, { "epoch": 0.8600790513833992, "grad_norm": 2.182415246963501, "learning_rate": 2.4589937492039555e-06, "loss": 3.9202, "step": 1088 }, { "epoch": 0.8608695652173913, "grad_norm": 2.702667713165283, "learning_rate": 2.458028944465993e-06, "loss": 4.1768, "step": 1089 }, { "epoch": 0.8616600790513834, "grad_norm": 2.3557612895965576, "learning_rate": 2.457063469849653e-06, "loss": 4.0374, "step": 1090 }, { "epoch": 0.8624505928853755, "grad_norm": 2.2317864894866943, "learning_rate": 2.4560973260300188e-06, "loss": 4.0202, "step": 1091 }, { "epoch": 0.8632411067193676, "grad_norm": 2.1775453090667725, "learning_rate": 2.4551305136826425e-06, "loss": 4.0084, "step": 1092 }, { "epoch": 0.8640316205533597, "grad_norm": 2.4135491847991943, "learning_rate": 2.454163033483546e-06, "loss": 3.8222, "step": 1093 }, { "epoch": 0.8648221343873518, "grad_norm": 2.1384360790252686, "learning_rate": 2.4531948861092147e-06, "loss": 4.4165, "step": 1094 }, { "epoch": 0.8656126482213439, "grad_norm": 2.172579526901245, "learning_rate": 2.452226072236602e-06, "loss": 4.0633, "step": 1095 }, { "epoch": 0.866403162055336, "grad_norm": 2.3730623722076416, "learning_rate": 2.451256592543128e-06, "loss": 4.0655, "step": 1096 }, { "epoch": 0.867193675889328, "grad_norm": 2.1374404430389404, "learning_rate": 2.4502864477066776e-06, "loss": 4.1795, "step": 1097 }, { "epoch": 0.8679841897233201, "grad_norm": 2.399367332458496, "learning_rate": 2.4493156384056006e-06, "loss": 3.4109, "step": 1098 }, { "epoch": 0.8687747035573122, "grad_norm": 2.0726394653320312, "learning_rate": 2.4483441653187117e-06, "loss": 4.1871, "step": 1099 }, { "epoch": 0.8695652173913043, "grad_norm": 2.228541612625122, "learning_rate": 2.4473720291252907e-06, "loss": 3.896, "step": 1100 }, { "epoch": 0.8703557312252964, "grad_norm": 2.351048231124878, "learning_rate": 2.4463992305050793e-06, "loss": 4.0282, "step": 1101 }, { "epoch": 0.8711462450592885, "grad_norm": 2.3314104080200195, "learning_rate": 2.445425770138283e-06, "loss": 4.1863, "step": 1102 }, { "epoch": 0.8719367588932806, "grad_norm": 2.1156435012817383, "learning_rate": 2.444451648705571e-06, "loss": 4.1607, "step": 1103 }, { "epoch": 0.8727272727272727, "grad_norm": 2.2093708515167236, "learning_rate": 2.443476866888073e-06, "loss": 4.1841, "step": 1104 }, { "epoch": 0.8735177865612648, "grad_norm": 2.346066951751709, "learning_rate": 2.4425014253673822e-06, "loss": 4.3519, "step": 1105 }, { "epoch": 0.8743083003952569, "grad_norm": 2.698798418045044, "learning_rate": 2.4415253248255524e-06, "loss": 4.1547, "step": 1106 }, { "epoch": 0.8750988142292491, "grad_norm": 2.3776657581329346, "learning_rate": 2.4405485659450974e-06, "loss": 4.5622, "step": 1107 }, { "epoch": 0.8758893280632412, "grad_norm": 2.4340333938598633, "learning_rate": 2.439571149408992e-06, "loss": 4.2106, "step": 1108 }, { "epoch": 0.8766798418972332, "grad_norm": 2.4225847721099854, "learning_rate": 2.438593075900672e-06, "loss": 3.9847, "step": 1109 }, { "epoch": 0.8774703557312253, "grad_norm": 2.0104219913482666, "learning_rate": 2.437614346104031e-06, "loss": 4.3456, "step": 1110 }, { "epoch": 0.8782608695652174, "grad_norm": 2.455273151397705, "learning_rate": 2.436634960703421e-06, "loss": 4.1071, "step": 1111 }, { "epoch": 0.8790513833992095, "grad_norm": 2.041102409362793, "learning_rate": 2.4356549203836536e-06, "loss": 4.4882, "step": 1112 }, { "epoch": 0.8798418972332016, "grad_norm": 2.214477300643921, "learning_rate": 2.4346742258299993e-06, "loss": 3.7846, "step": 1113 }, { "epoch": 0.8806324110671937, "grad_norm": 2.162579298019409, "learning_rate": 2.433692877728184e-06, "loss": 4.0825, "step": 1114 }, { "epoch": 0.8814229249011858, "grad_norm": 2.7644379138946533, "learning_rate": 2.4327108767643914e-06, "loss": 3.8697, "step": 1115 }, { "epoch": 0.8822134387351779, "grad_norm": 2.1582119464874268, "learning_rate": 2.431728223625262e-06, "loss": 4.2578, "step": 1116 }, { "epoch": 0.88300395256917, "grad_norm": 2.127912998199463, "learning_rate": 2.4307449189978916e-06, "loss": 4.1391, "step": 1117 }, { "epoch": 0.8837944664031621, "grad_norm": 2.5911049842834473, "learning_rate": 2.429760963569832e-06, "loss": 4.1295, "step": 1118 }, { "epoch": 0.8845849802371542, "grad_norm": 2.6482467651367188, "learning_rate": 2.428776358029091e-06, "loss": 4.0104, "step": 1119 }, { "epoch": 0.8853754940711462, "grad_norm": 2.250150442123413, "learning_rate": 2.427791103064129e-06, "loss": 4.0695, "step": 1120 }, { "epoch": 0.8861660079051383, "grad_norm": 2.7032039165496826, "learning_rate": 2.4268051993638618e-06, "loss": 3.5309, "step": 1121 }, { "epoch": 0.8869565217391304, "grad_norm": 2.109950065612793, "learning_rate": 2.4258186476176587e-06, "loss": 4.2467, "step": 1122 }, { "epoch": 0.8877470355731225, "grad_norm": 2.1762611865997314, "learning_rate": 2.4248314485153414e-06, "loss": 4.0276, "step": 1123 }, { "epoch": 0.8885375494071146, "grad_norm": 2.0421411991119385, "learning_rate": 2.4238436027471855e-06, "loss": 4.1133, "step": 1124 }, { "epoch": 0.8893280632411067, "grad_norm": 2.3982796669006348, "learning_rate": 2.4228551110039176e-06, "loss": 3.7773, "step": 1125 }, { "epoch": 0.8901185770750988, "grad_norm": 2.3631279468536377, "learning_rate": 2.421865973976717e-06, "loss": 4.0194, "step": 1126 }, { "epoch": 0.8909090909090909, "grad_norm": 2.416090488433838, "learning_rate": 2.420876192357213e-06, "loss": 3.9402, "step": 1127 }, { "epoch": 0.891699604743083, "grad_norm": 2.210829973220825, "learning_rate": 2.4198857668374866e-06, "loss": 3.707, "step": 1128 }, { "epoch": 0.892490118577075, "grad_norm": 1.9773284196853638, "learning_rate": 2.4188946981100684e-06, "loss": 4.3107, "step": 1129 }, { "epoch": 0.8932806324110671, "grad_norm": 2.014040946960449, "learning_rate": 2.4179029868679396e-06, "loss": 4.3441, "step": 1130 }, { "epoch": 0.8940711462450592, "grad_norm": 2.356656551361084, "learning_rate": 2.4169106338045293e-06, "loss": 3.9581, "step": 1131 }, { "epoch": 0.8948616600790514, "grad_norm": 3.0149216651916504, "learning_rate": 2.4159176396137166e-06, "loss": 3.9237, "step": 1132 }, { "epoch": 0.8956521739130435, "grad_norm": 2.346492052078247, "learning_rate": 2.414924004989829e-06, "loss": 3.6676, "step": 1133 }, { "epoch": 0.8964426877470356, "grad_norm": 2.2715091705322266, "learning_rate": 2.4139297306276408e-06, "loss": 3.8287, "step": 1134 }, { "epoch": 0.8972332015810277, "grad_norm": 3.8307108879089355, "learning_rate": 2.412934817222374e-06, "loss": 3.885, "step": 1135 }, { "epoch": 0.8980237154150198, "grad_norm": 2.634117364883423, "learning_rate": 2.4119392654696977e-06, "loss": 3.8175, "step": 1136 }, { "epoch": 0.8988142292490119, "grad_norm": 2.381533622741699, "learning_rate": 2.410943076065727e-06, "loss": 4.0366, "step": 1137 }, { "epoch": 0.899604743083004, "grad_norm": 2.331939220428467, "learning_rate": 2.409946249707023e-06, "loss": 3.8959, "step": 1138 }, { "epoch": 0.9003952569169961, "grad_norm": 2.3123679161071777, "learning_rate": 2.4089487870905927e-06, "loss": 3.8897, "step": 1139 }, { "epoch": 0.9011857707509882, "grad_norm": 2.1424527168273926, "learning_rate": 2.407950688913887e-06, "loss": 3.9378, "step": 1140 }, { "epoch": 0.9019762845849802, "grad_norm": 2.3447368144989014, "learning_rate": 2.406951955874802e-06, "loss": 3.6618, "step": 1141 }, { "epoch": 0.9027667984189723, "grad_norm": 2.097870349884033, "learning_rate": 2.4059525886716775e-06, "loss": 3.8166, "step": 1142 }, { "epoch": 0.9035573122529644, "grad_norm": 2.243609666824341, "learning_rate": 2.4049525880032965e-06, "loss": 3.9667, "step": 1143 }, { "epoch": 0.9043478260869565, "grad_norm": 2.384399890899658, "learning_rate": 2.403951954568885e-06, "loss": 3.9214, "step": 1144 }, { "epoch": 0.9051383399209486, "grad_norm": 2.103300094604492, "learning_rate": 2.4029506890681115e-06, "loss": 4.0513, "step": 1145 }, { "epoch": 0.9059288537549407, "grad_norm": 2.218987464904785, "learning_rate": 2.4019487922010863e-06, "loss": 4.0312, "step": 1146 }, { "epoch": 0.9067193675889328, "grad_norm": 2.224579095840454, "learning_rate": 2.4009462646683622e-06, "loss": 4.0921, "step": 1147 }, { "epoch": 0.9075098814229249, "grad_norm": 2.3234472274780273, "learning_rate": 2.3999431071709315e-06, "loss": 3.9442, "step": 1148 }, { "epoch": 0.908300395256917, "grad_norm": 2.5314035415649414, "learning_rate": 2.3989393204102273e-06, "loss": 4.391, "step": 1149 }, { "epoch": 0.9090909090909091, "grad_norm": 2.9518637657165527, "learning_rate": 2.3979349050881234e-06, "loss": 3.5999, "step": 1150 }, { "epoch": 0.9098814229249012, "grad_norm": 2.0601985454559326, "learning_rate": 2.3969298619069335e-06, "loss": 4.2404, "step": 1151 }, { "epoch": 0.9106719367588932, "grad_norm": 2.3019211292266846, "learning_rate": 2.395924191569408e-06, "loss": 3.9644, "step": 1152 }, { "epoch": 0.9114624505928853, "grad_norm": 2.4942526817321777, "learning_rate": 2.3949178947787384e-06, "loss": 4.3815, "step": 1153 }, { "epoch": 0.9122529644268774, "grad_norm": 2.273003578186035, "learning_rate": 2.3939109722385533e-06, "loss": 3.8374, "step": 1154 }, { "epoch": 0.9130434782608695, "grad_norm": 2.189864158630371, "learning_rate": 2.392903424652918e-06, "loss": 4.0908, "step": 1155 }, { "epoch": 0.9138339920948617, "grad_norm": 2.2084333896636963, "learning_rate": 2.391895252726336e-06, "loss": 4.0738, "step": 1156 }, { "epoch": 0.9146245059288538, "grad_norm": 2.4546892642974854, "learning_rate": 2.390886457163747e-06, "loss": 3.9554, "step": 1157 }, { "epoch": 0.9154150197628459, "grad_norm": 3.5218849182128906, "learning_rate": 2.3898770386705262e-06, "loss": 4.2479, "step": 1158 }, { "epoch": 0.916205533596838, "grad_norm": 2.070537805557251, "learning_rate": 2.3888669979524857e-06, "loss": 4.2474, "step": 1159 }, { "epoch": 0.9169960474308301, "grad_norm": 2.243116855621338, "learning_rate": 2.387856335715871e-06, "loss": 4.2875, "step": 1160 }, { "epoch": 0.9177865612648222, "grad_norm": 2.302180290222168, "learning_rate": 2.386845052667364e-06, "loss": 3.9264, "step": 1161 }, { "epoch": 0.9185770750988143, "grad_norm": 2.1883704662323, "learning_rate": 2.3858331495140782e-06, "loss": 4.2449, "step": 1162 }, { "epoch": 0.9193675889328063, "grad_norm": 2.0650179386138916, "learning_rate": 2.3848206269635636e-06, "loss": 4.0829, "step": 1163 }, { "epoch": 0.9201581027667984, "grad_norm": 2.3316683769226074, "learning_rate": 2.3838074857238014e-06, "loss": 3.9344, "step": 1164 }, { "epoch": 0.9209486166007905, "grad_norm": 2.3640189170837402, "learning_rate": 2.382793726503206e-06, "loss": 3.9224, "step": 1165 }, { "epoch": 0.9217391304347826, "grad_norm": 2.8492112159729004, "learning_rate": 2.3817793500106235e-06, "loss": 3.9805, "step": 1166 }, { "epoch": 0.9225296442687747, "grad_norm": 2.4240188598632812, "learning_rate": 2.380764356955333e-06, "loss": 4.1052, "step": 1167 }, { "epoch": 0.9233201581027668, "grad_norm": 2.1547110080718994, "learning_rate": 2.379748748047042e-06, "loss": 3.9329, "step": 1168 }, { "epoch": 0.9241106719367589, "grad_norm": 1.9924272298812866, "learning_rate": 2.3787325239958915e-06, "loss": 4.3603, "step": 1169 }, { "epoch": 0.924901185770751, "grad_norm": 2.1101722717285156, "learning_rate": 2.377715685512451e-06, "loss": 4.1528, "step": 1170 }, { "epoch": 0.9256916996047431, "grad_norm": 2.1988637447357178, "learning_rate": 2.3766982333077198e-06, "loss": 3.7344, "step": 1171 }, { "epoch": 0.9264822134387352, "grad_norm": 1.9965111017227173, "learning_rate": 2.3756801680931262e-06, "loss": 4.5237, "step": 1172 }, { "epoch": 0.9272727272727272, "grad_norm": 2.0128743648529053, "learning_rate": 2.3746614905805286e-06, "loss": 4.4064, "step": 1173 }, { "epoch": 0.9280632411067193, "grad_norm": 2.950958728790283, "learning_rate": 2.3736422014822117e-06, "loss": 3.7756, "step": 1174 }, { "epoch": 0.9288537549407114, "grad_norm": 2.7156784534454346, "learning_rate": 2.372622301510889e-06, "loss": 3.7952, "step": 1175 }, { "epoch": 0.9296442687747035, "grad_norm": 2.280895233154297, "learning_rate": 2.3716017913797e-06, "loss": 4.0363, "step": 1176 }, { "epoch": 0.9304347826086956, "grad_norm": 2.3565969467163086, "learning_rate": 2.3705806718022123e-06, "loss": 3.8037, "step": 1177 }, { "epoch": 0.9312252964426877, "grad_norm": 2.7572286128997803, "learning_rate": 2.369558943492418e-06, "loss": 3.8943, "step": 1178 }, { "epoch": 0.9320158102766798, "grad_norm": 2.1680643558502197, "learning_rate": 2.3685366071647362e-06, "loss": 4.1109, "step": 1179 }, { "epoch": 0.932806324110672, "grad_norm": 2.138692617416382, "learning_rate": 2.3675136635340105e-06, "loss": 4.2431, "step": 1180 }, { "epoch": 0.9335968379446641, "grad_norm": 2.341008424758911, "learning_rate": 2.3664901133155098e-06, "loss": 3.8941, "step": 1181 }, { "epoch": 0.9343873517786562, "grad_norm": 2.0385186672210693, "learning_rate": 2.365465957224926e-06, "loss": 4.1908, "step": 1182 }, { "epoch": 0.9351778656126483, "grad_norm": 2.3826184272766113, "learning_rate": 2.3644411959783754e-06, "loss": 4.1639, "step": 1183 }, { "epoch": 0.9359683794466404, "grad_norm": 1.97429621219635, "learning_rate": 2.3634158302923976e-06, "loss": 4.4504, "step": 1184 }, { "epoch": 0.9367588932806324, "grad_norm": 2.214435577392578, "learning_rate": 2.3623898608839544e-06, "loss": 3.5951, "step": 1185 }, { "epoch": 0.9375494071146245, "grad_norm": 2.135784149169922, "learning_rate": 2.3613632884704296e-06, "loss": 4.06, "step": 1186 }, { "epoch": 0.9383399209486166, "grad_norm": 2.3517794609069824, "learning_rate": 2.3603361137696295e-06, "loss": 3.9003, "step": 1187 }, { "epoch": 0.9391304347826087, "grad_norm": 2.769784927368164, "learning_rate": 2.35930833749978e-06, "loss": 3.8246, "step": 1188 }, { "epoch": 0.9399209486166008, "grad_norm": 2.1810176372528076, "learning_rate": 2.3582799603795297e-06, "loss": 4.0916, "step": 1189 }, { "epoch": 0.9407114624505929, "grad_norm": 2.279831886291504, "learning_rate": 2.3572509831279456e-06, "loss": 3.8865, "step": 1190 }, { "epoch": 0.941501976284585, "grad_norm": 2.1091346740722656, "learning_rate": 2.3562214064645153e-06, "loss": 3.9119, "step": 1191 }, { "epoch": 0.9422924901185771, "grad_norm": 2.1809732913970947, "learning_rate": 2.355191231109145e-06, "loss": 4.1318, "step": 1192 }, { "epoch": 0.9430830039525692, "grad_norm": 2.153026580810547, "learning_rate": 2.35416045778216e-06, "loss": 3.7112, "step": 1193 }, { "epoch": 0.9438735177865613, "grad_norm": 2.586646795272827, "learning_rate": 2.3531290872043024e-06, "loss": 4.1347, "step": 1194 }, { "epoch": 0.9446640316205533, "grad_norm": 2.426215410232544, "learning_rate": 2.3520971200967337e-06, "loss": 4.1185, "step": 1195 }, { "epoch": 0.9454545454545454, "grad_norm": 2.5409762859344482, "learning_rate": 2.351064557181032e-06, "loss": 4.1351, "step": 1196 }, { "epoch": 0.9462450592885375, "grad_norm": 2.313770055770874, "learning_rate": 2.3500313991791915e-06, "loss": 4.2378, "step": 1197 }, { "epoch": 0.9470355731225296, "grad_norm": 2.222620964050293, "learning_rate": 2.348997646813622e-06, "loss": 3.9904, "step": 1198 }, { "epoch": 0.9478260869565217, "grad_norm": 2.1550450325012207, "learning_rate": 2.347963300807151e-06, "loss": 3.9395, "step": 1199 }, { "epoch": 0.9486166007905138, "grad_norm": 2.3107638359069824, "learning_rate": 2.346928361883019e-06, "loss": 4.2639, "step": 1200 }, { "epoch": 0.9494071146245059, "grad_norm": 2.0839531421661377, "learning_rate": 2.3458928307648812e-06, "loss": 4.4029, "step": 1201 }, { "epoch": 0.950197628458498, "grad_norm": 2.2067761421203613, "learning_rate": 2.3448567081768087e-06, "loss": 4.5848, "step": 1202 }, { "epoch": 0.9509881422924901, "grad_norm": 2.1528453826904297, "learning_rate": 2.343819994843284e-06, "loss": 4.4279, "step": 1203 }, { "epoch": 0.9517786561264822, "grad_norm": 2.0640525817871094, "learning_rate": 2.3427826914892037e-06, "loss": 4.2429, "step": 1204 }, { "epoch": 0.9525691699604744, "grad_norm": 2.2407965660095215, "learning_rate": 2.3417447988398777e-06, "loss": 4.1202, "step": 1205 }, { "epoch": 0.9533596837944665, "grad_norm": 3.2361183166503906, "learning_rate": 2.3407063176210266e-06, "loss": 4.142, "step": 1206 }, { "epoch": 0.9541501976284585, "grad_norm": 2.049021005630493, "learning_rate": 2.3396672485587826e-06, "loss": 4.192, "step": 1207 }, { "epoch": 0.9549407114624506, "grad_norm": 2.265150785446167, "learning_rate": 2.3386275923796902e-06, "loss": 3.9385, "step": 1208 }, { "epoch": 0.9557312252964427, "grad_norm": 2.330951452255249, "learning_rate": 2.337587349810703e-06, "loss": 3.9202, "step": 1209 }, { "epoch": 0.9565217391304348, "grad_norm": 2.301232099533081, "learning_rate": 2.3365465215791855e-06, "loss": 3.9511, "step": 1210 }, { "epoch": 0.9573122529644269, "grad_norm": 2.3812618255615234, "learning_rate": 2.3355051084129114e-06, "loss": 3.834, "step": 1211 }, { "epoch": 0.958102766798419, "grad_norm": 2.0690858364105225, "learning_rate": 2.334463111040064e-06, "loss": 3.7902, "step": 1212 }, { "epoch": 0.9588932806324111, "grad_norm": 2.5348165035247803, "learning_rate": 2.333420530189234e-06, "loss": 3.9806, "step": 1213 }, { "epoch": 0.9596837944664032, "grad_norm": 2.462832450866699, "learning_rate": 2.3323773665894207e-06, "loss": 3.9045, "step": 1214 }, { "epoch": 0.9604743083003953, "grad_norm": 4.105721950531006, "learning_rate": 2.3313336209700306e-06, "loss": 4.0302, "step": 1215 }, { "epoch": 0.9612648221343874, "grad_norm": 2.292180299758911, "learning_rate": 2.330289294060878e-06, "loss": 4.0007, "step": 1216 }, { "epoch": 0.9620553359683794, "grad_norm": 2.2330751419067383, "learning_rate": 2.3292443865921823e-06, "loss": 4.1977, "step": 1217 }, { "epoch": 0.9628458498023715, "grad_norm": 2.0693352222442627, "learning_rate": 2.32819889929457e-06, "loss": 4.0539, "step": 1218 }, { "epoch": 0.9636363636363636, "grad_norm": 2.2685492038726807, "learning_rate": 2.3271528328990725e-06, "loss": 3.5546, "step": 1219 }, { "epoch": 0.9644268774703557, "grad_norm": 2.302929401397705, "learning_rate": 2.3261061881371262e-06, "loss": 3.6277, "step": 1220 }, { "epoch": 0.9652173913043478, "grad_norm": 2.3870527744293213, "learning_rate": 2.325058965740572e-06, "loss": 3.8417, "step": 1221 }, { "epoch": 0.9660079051383399, "grad_norm": 2.428403377532959, "learning_rate": 2.3240111664416546e-06, "loss": 4.0495, "step": 1222 }, { "epoch": 0.966798418972332, "grad_norm": 2.0032477378845215, "learning_rate": 2.3229627909730223e-06, "loss": 4.2921, "step": 1223 }, { "epoch": 0.9675889328063241, "grad_norm": 2.5847651958465576, "learning_rate": 2.321913840067726e-06, "loss": 3.9141, "step": 1224 }, { "epoch": 0.9683794466403162, "grad_norm": 2.072962522506714, "learning_rate": 2.3208643144592185e-06, "loss": 4.0663, "step": 1225 }, { "epoch": 0.9691699604743083, "grad_norm": 2.078094959259033, "learning_rate": 2.319814214881356e-06, "loss": 4.1254, "step": 1226 }, { "epoch": 0.9699604743083003, "grad_norm": 2.1655216217041016, "learning_rate": 2.3187635420683935e-06, "loss": 4.1573, "step": 1227 }, { "epoch": 0.9707509881422924, "grad_norm": 2.0662686824798584, "learning_rate": 2.3177122967549897e-06, "loss": 4.3534, "step": 1228 }, { "epoch": 0.9715415019762846, "grad_norm": 4.033877372741699, "learning_rate": 2.3166604796762017e-06, "loss": 4.049, "step": 1229 }, { "epoch": 0.9723320158102767, "grad_norm": 2.503960132598877, "learning_rate": 2.3156080915674876e-06, "loss": 3.7278, "step": 1230 }, { "epoch": 0.9731225296442688, "grad_norm": 2.1854188442230225, "learning_rate": 2.314555133164703e-06, "loss": 4.1235, "step": 1231 }, { "epoch": 0.9739130434782609, "grad_norm": 2.3344779014587402, "learning_rate": 2.3135016052041046e-06, "loss": 3.6916, "step": 1232 }, { "epoch": 0.974703557312253, "grad_norm": 2.103050470352173, "learning_rate": 2.3124475084223455e-06, "loss": 4.1903, "step": 1233 }, { "epoch": 0.9754940711462451, "grad_norm": 2.3171613216400146, "learning_rate": 2.3113928435564765e-06, "loss": 4.0165, "step": 1234 }, { "epoch": 0.9762845849802372, "grad_norm": 1.9697836637496948, "learning_rate": 2.3103376113439474e-06, "loss": 4.0877, "step": 1235 }, { "epoch": 0.9770750988142293, "grad_norm": 2.271580457687378, "learning_rate": 2.3092818125226034e-06, "loss": 4.033, "step": 1236 }, { "epoch": 0.9778656126482214, "grad_norm": 2.015069007873535, "learning_rate": 2.3082254478306855e-06, "loss": 4.3738, "step": 1237 }, { "epoch": 0.9786561264822135, "grad_norm": 2.1734602451324463, "learning_rate": 2.3071685180068314e-06, "loss": 4.0599, "step": 1238 }, { "epoch": 0.9794466403162055, "grad_norm": 2.4295225143432617, "learning_rate": 2.3061110237900735e-06, "loss": 3.8722, "step": 1239 }, { "epoch": 0.9802371541501976, "grad_norm": 2.30010724067688, "learning_rate": 2.3050529659198385e-06, "loss": 4.0674, "step": 1240 }, { "epoch": 0.9810276679841897, "grad_norm": 2.0479369163513184, "learning_rate": 2.303994345135948e-06, "loss": 4.2303, "step": 1241 }, { "epoch": 0.9818181818181818, "grad_norm": 2.3217289447784424, "learning_rate": 2.302935162178617e-06, "loss": 4.0136, "step": 1242 }, { "epoch": 0.9826086956521739, "grad_norm": 2.719766616821289, "learning_rate": 2.301875417788452e-06, "loss": 3.8013, "step": 1243 }, { "epoch": 0.983399209486166, "grad_norm": 2.2762844562530518, "learning_rate": 2.3008151127064554e-06, "loss": 3.3472, "step": 1244 }, { "epoch": 0.9841897233201581, "grad_norm": 2.3751533031463623, "learning_rate": 2.299754247674018e-06, "loss": 3.8452, "step": 1245 }, { "epoch": 0.9849802371541502, "grad_norm": 1.9641880989074707, "learning_rate": 2.298692823432925e-06, "loss": 4.2692, "step": 1246 }, { "epoch": 0.9857707509881423, "grad_norm": 2.3364288806915283, "learning_rate": 2.29763084072535e-06, "loss": 3.9468, "step": 1247 }, { "epoch": 0.9865612648221344, "grad_norm": 2.5069897174835205, "learning_rate": 2.2965683002938602e-06, "loss": 3.9164, "step": 1248 }, { "epoch": 0.9873517786561264, "grad_norm": 2.107379674911499, "learning_rate": 2.29550520288141e-06, "loss": 4.1158, "step": 1249 }, { "epoch": 0.9881422924901185, "grad_norm": 2.1871888637542725, "learning_rate": 2.2944415492313444e-06, "loss": 4.0677, "step": 1250 }, { "epoch": 0.9889328063241106, "grad_norm": 3.0084242820739746, "learning_rate": 2.293377340087398e-06, "loss": 3.9037, "step": 1251 }, { "epoch": 0.9897233201581027, "grad_norm": 2.019631862640381, "learning_rate": 2.2923125761936927e-06, "loss": 4.2714, "step": 1252 }, { "epoch": 0.9905138339920949, "grad_norm": 2.2982358932495117, "learning_rate": 2.291247258294738e-06, "loss": 3.84, "step": 1253 }, { "epoch": 0.991304347826087, "grad_norm": 2.5358152389526367, "learning_rate": 2.290181387135432e-06, "loss": 4.0927, "step": 1254 }, { "epoch": 0.9920948616600791, "grad_norm": 2.377375364303589, "learning_rate": 2.2891149634610595e-06, "loss": 3.8409, "step": 1255 }, { "epoch": 0.9928853754940712, "grad_norm": 2.9822185039520264, "learning_rate": 2.288047988017291e-06, "loss": 3.9354, "step": 1256 }, { "epoch": 0.9936758893280633, "grad_norm": 2.3046398162841797, "learning_rate": 2.2869804615501827e-06, "loss": 3.6414, "step": 1257 }, { "epoch": 0.9944664031620554, "grad_norm": 2.473414897918701, "learning_rate": 2.2859123848061768e-06, "loss": 4.0874, "step": 1258 }, { "epoch": 0.9952569169960475, "grad_norm": 2.0642971992492676, "learning_rate": 2.2848437585320996e-06, "loss": 4.2432, "step": 1259 }, { "epoch": 0.9960474308300395, "grad_norm": 2.169325590133667, "learning_rate": 2.2837745834751625e-06, "loss": 4.304, "step": 1260 }, { "epoch": 0.9968379446640316, "grad_norm": 2.1323187351226807, "learning_rate": 2.2827048603829597e-06, "loss": 4.1294, "step": 1261 }, { "epoch": 0.9976284584980237, "grad_norm": 2.3951430320739746, "learning_rate": 2.281634590003469e-06, "loss": 3.8919, "step": 1262 }, { "epoch": 0.9984189723320158, "grad_norm": 2.068286895751953, "learning_rate": 2.2805637730850514e-06, "loss": 4.2021, "step": 1263 }, { "epoch": 0.9992094861660079, "grad_norm": 2.365623950958252, "learning_rate": 2.2794924103764487e-06, "loss": 3.9916, "step": 1264 }, { "epoch": 1.0, "grad_norm": 2.0748913288116455, "learning_rate": 2.2784205026267854e-06, "loss": 4.1339, "step": 1265 }, { "epoch": 1.000790513833992, "grad_norm": 2.1420412063598633, "learning_rate": 2.277348050585567e-06, "loss": 4.1001, "step": 1266 }, { "epoch": 1.0015810276679842, "grad_norm": 3.18330979347229, "learning_rate": 2.2762750550026795e-06, "loss": 3.5574, "step": 1267 }, { "epoch": 1.0023715415019763, "grad_norm": 2.2610347270965576, "learning_rate": 2.275201516628389e-06, "loss": 3.7837, "step": 1268 }, { "epoch": 1.0031620553359684, "grad_norm": 2.1156516075134277, "learning_rate": 2.2741274362133405e-06, "loss": 3.9219, "step": 1269 }, { "epoch": 1.0039525691699605, "grad_norm": 2.133855104446411, "learning_rate": 2.2730528145085586e-06, "loss": 4.178, "step": 1270 }, { "epoch": 1.0047430830039525, "grad_norm": 2.0864462852478027, "learning_rate": 2.2719776522654473e-06, "loss": 4.1238, "step": 1271 }, { "epoch": 1.0055335968379446, "grad_norm": 2.475883722305298, "learning_rate": 2.2709019502357867e-06, "loss": 3.9748, "step": 1272 }, { "epoch": 1.0063241106719367, "grad_norm": 2.428999423980713, "learning_rate": 2.2698257091717353e-06, "loss": 4.0822, "step": 1273 }, { "epoch": 1.0071146245059288, "grad_norm": 2.2497575283050537, "learning_rate": 2.2687489298258282e-06, "loss": 4.0585, "step": 1274 }, { "epoch": 1.007905138339921, "grad_norm": 2.1769533157348633, "learning_rate": 2.267671612950978e-06, "loss": 3.9567, "step": 1275 }, { "epoch": 1.008695652173913, "grad_norm": 3.103576421737671, "learning_rate": 2.266593759300471e-06, "loss": 3.7752, "step": 1276 }, { "epoch": 1.009486166007905, "grad_norm": 2.768284559249878, "learning_rate": 2.2655153696279715e-06, "loss": 3.9377, "step": 1277 }, { "epoch": 1.0102766798418972, "grad_norm": 2.4949209690093994, "learning_rate": 2.264436444687516e-06, "loss": 4.0641, "step": 1278 }, { "epoch": 1.0110671936758893, "grad_norm": 2.17779541015625, "learning_rate": 2.263356985233517e-06, "loss": 3.9872, "step": 1279 }, { "epoch": 1.0118577075098814, "grad_norm": 2.383943796157837, "learning_rate": 2.2622769920207602e-06, "loss": 3.7829, "step": 1280 }, { "epoch": 1.0126482213438734, "grad_norm": 1.9330775737762451, "learning_rate": 2.261196465804404e-06, "loss": 4.2111, "step": 1281 }, { "epoch": 1.0134387351778655, "grad_norm": 2.1417415142059326, "learning_rate": 2.2601154073399807e-06, "loss": 3.9774, "step": 1282 }, { "epoch": 1.0142292490118576, "grad_norm": 2.1246304512023926, "learning_rate": 2.259033817383394e-06, "loss": 4.2305, "step": 1283 }, { "epoch": 1.0150197628458497, "grad_norm": 2.032921552658081, "learning_rate": 2.2579516966909188e-06, "loss": 4.1768, "step": 1284 }, { "epoch": 1.0158102766798418, "grad_norm": 2.395545721054077, "learning_rate": 2.256869046019202e-06, "loss": 3.9964, "step": 1285 }, { "epoch": 1.016600790513834, "grad_norm": 2.206862449645996, "learning_rate": 2.2557858661252603e-06, "loss": 3.8759, "step": 1286 }, { "epoch": 1.017391304347826, "grad_norm": 2.9150352478027344, "learning_rate": 2.2547021577664817e-06, "loss": 4.2546, "step": 1287 }, { "epoch": 1.018181818181818, "grad_norm": 2.173308849334717, "learning_rate": 2.253617921700621e-06, "loss": 3.952, "step": 1288 }, { "epoch": 1.0189723320158102, "grad_norm": 2.1344046592712402, "learning_rate": 2.2525331586858064e-06, "loss": 3.6527, "step": 1289 }, { "epoch": 1.0197628458498025, "grad_norm": 2.317352533340454, "learning_rate": 2.2514478694805303e-06, "loss": 4.1446, "step": 1290 }, { "epoch": 1.0205533596837946, "grad_norm": 2.1856250762939453, "learning_rate": 2.250362054843655e-06, "loss": 4.2199, "step": 1291 }, { "epoch": 1.0213438735177867, "grad_norm": 2.5235278606414795, "learning_rate": 2.24927571553441e-06, "loss": 3.9755, "step": 1292 }, { "epoch": 1.0221343873517788, "grad_norm": 2.177804708480835, "learning_rate": 2.248188852312392e-06, "loss": 3.8587, "step": 1293 }, { "epoch": 1.0229249011857708, "grad_norm": 2.0257105827331543, "learning_rate": 2.2471014659375627e-06, "loss": 3.998, "step": 1294 }, { "epoch": 1.023715415019763, "grad_norm": 2.430162191390991, "learning_rate": 2.2460135571702508e-06, "loss": 3.6123, "step": 1295 }, { "epoch": 1.024505928853755, "grad_norm": 1.9563958644866943, "learning_rate": 2.2449251267711504e-06, "loss": 4.1014, "step": 1296 }, { "epoch": 1.0252964426877471, "grad_norm": 2.366849422454834, "learning_rate": 2.24383617550132e-06, "loss": 3.8877, "step": 1297 }, { "epoch": 1.0260869565217392, "grad_norm": 2.1021549701690674, "learning_rate": 2.2427467041221813e-06, "loss": 3.8153, "step": 1298 }, { "epoch": 1.0268774703557313, "grad_norm": 2.154458522796631, "learning_rate": 2.2416567133955217e-06, "loss": 4.0958, "step": 1299 }, { "epoch": 1.0276679841897234, "grad_norm": 2.2397406101226807, "learning_rate": 2.24056620408349e-06, "loss": 3.763, "step": 1300 }, { "epoch": 1.0284584980237155, "grad_norm": 2.048098564147949, "learning_rate": 2.2394751769485978e-06, "loss": 4.3417, "step": 1301 }, { "epoch": 1.0292490118577076, "grad_norm": 2.1723663806915283, "learning_rate": 2.2383836327537204e-06, "loss": 4.1235, "step": 1302 }, { "epoch": 1.0300395256916997, "grad_norm": 2.0826258659362793, "learning_rate": 2.237291572262093e-06, "loss": 4.213, "step": 1303 }, { "epoch": 1.0308300395256917, "grad_norm": 2.085358142852783, "learning_rate": 2.2361989962373116e-06, "loss": 4.038, "step": 1304 }, { "epoch": 1.0316205533596838, "grad_norm": 2.0704662799835205, "learning_rate": 2.2351059054433342e-06, "loss": 4.1084, "step": 1305 }, { "epoch": 1.032411067193676, "grad_norm": 1.9360425472259521, "learning_rate": 2.234012300644477e-06, "loss": 4.3052, "step": 1306 }, { "epoch": 1.033201581027668, "grad_norm": 2.4654064178466797, "learning_rate": 2.2329181826054176e-06, "loss": 3.8517, "step": 1307 }, { "epoch": 1.03399209486166, "grad_norm": 1.9376568794250488, "learning_rate": 2.2318235520911906e-06, "loss": 4.3347, "step": 1308 }, { "epoch": 1.0347826086956522, "grad_norm": 2.231109619140625, "learning_rate": 2.2307284098671903e-06, "loss": 3.8715, "step": 1309 }, { "epoch": 1.0355731225296443, "grad_norm": 1.9418078660964966, "learning_rate": 2.2296327566991673e-06, "loss": 4.0817, "step": 1310 }, { "epoch": 1.0363636363636364, "grad_norm": 2.0864155292510986, "learning_rate": 2.2285365933532305e-06, "loss": 4.2537, "step": 1311 }, { "epoch": 1.0371541501976285, "grad_norm": 2.0682952404022217, "learning_rate": 2.2274399205958466e-06, "loss": 4.0731, "step": 1312 }, { "epoch": 1.0379446640316206, "grad_norm": 2.1476335525512695, "learning_rate": 2.226342739193836e-06, "loss": 4.0801, "step": 1313 }, { "epoch": 1.0387351778656126, "grad_norm": 2.951202154159546, "learning_rate": 2.225245049914377e-06, "loss": 3.8201, "step": 1314 }, { "epoch": 1.0395256916996047, "grad_norm": 2.0945374965667725, "learning_rate": 2.224146853525001e-06, "loss": 4.3766, "step": 1315 }, { "epoch": 1.0403162055335968, "grad_norm": 2.1866776943206787, "learning_rate": 2.2230481507935954e-06, "loss": 4.1357, "step": 1316 }, { "epoch": 1.041106719367589, "grad_norm": 2.3060836791992188, "learning_rate": 2.221948942488401e-06, "loss": 4.0274, "step": 1317 }, { "epoch": 1.041897233201581, "grad_norm": 2.1887152194976807, "learning_rate": 2.220849229378014e-06, "loss": 3.8503, "step": 1318 }, { "epoch": 1.042687747035573, "grad_norm": 2.1214370727539062, "learning_rate": 2.2197490122313806e-06, "loss": 3.8618, "step": 1319 }, { "epoch": 1.0434782608695652, "grad_norm": 2.2305328845977783, "learning_rate": 2.2186482918178008e-06, "loss": 4.0019, "step": 1320 }, { "epoch": 1.0442687747035573, "grad_norm": 2.206160306930542, "learning_rate": 2.2175470689069267e-06, "loss": 3.942, "step": 1321 }, { "epoch": 1.0450592885375494, "grad_norm": 2.2406437397003174, "learning_rate": 2.216445344268762e-06, "loss": 3.8142, "step": 1322 }, { "epoch": 1.0458498023715415, "grad_norm": 2.6001412868499756, "learning_rate": 2.21534311867366e-06, "loss": 4.1058, "step": 1323 }, { "epoch": 1.0466403162055335, "grad_norm": 2.232164144515991, "learning_rate": 2.214240392892326e-06, "loss": 4.1514, "step": 1324 }, { "epoch": 1.0474308300395256, "grad_norm": 2.2514429092407227, "learning_rate": 2.213137167695813e-06, "loss": 4.0446, "step": 1325 }, { "epoch": 1.0482213438735177, "grad_norm": 2.1762301921844482, "learning_rate": 2.2120334438555253e-06, "loss": 4.0768, "step": 1326 }, { "epoch": 1.0490118577075098, "grad_norm": 2.099612236022949, "learning_rate": 2.210929222143214e-06, "loss": 4.4282, "step": 1327 }, { "epoch": 1.049802371541502, "grad_norm": 2.2732012271881104, "learning_rate": 2.2098245033309803e-06, "loss": 4.2003, "step": 1328 }, { "epoch": 1.050592885375494, "grad_norm": 2.351555824279785, "learning_rate": 2.2087192881912713e-06, "loss": 3.8742, "step": 1329 }, { "epoch": 1.051383399209486, "grad_norm": 2.1276140213012695, "learning_rate": 2.2076135774968812e-06, "loss": 4.1109, "step": 1330 }, { "epoch": 1.0521739130434782, "grad_norm": 2.3467094898223877, "learning_rate": 2.2065073720209522e-06, "loss": 3.5048, "step": 1331 }, { "epoch": 1.0529644268774703, "grad_norm": 2.3851447105407715, "learning_rate": 2.205400672536971e-06, "loss": 4.2387, "step": 1332 }, { "epoch": 1.0537549407114624, "grad_norm": 2.16373348236084, "learning_rate": 2.20429347981877e-06, "loss": 4.0525, "step": 1333 }, { "epoch": 1.0545454545454545, "grad_norm": 2.212601900100708, "learning_rate": 2.203185794640528e-06, "loss": 3.6838, "step": 1334 }, { "epoch": 1.0553359683794465, "grad_norm": 2.3070015907287598, "learning_rate": 2.202077617776765e-06, "loss": 4.1222, "step": 1335 }, { "epoch": 1.0561264822134386, "grad_norm": 2.0389814376831055, "learning_rate": 2.200968950002348e-06, "loss": 4.0829, "step": 1336 }, { "epoch": 1.0569169960474307, "grad_norm": 2.940701723098755, "learning_rate": 2.1998597920924847e-06, "loss": 3.8952, "step": 1337 }, { "epoch": 1.057707509881423, "grad_norm": 2.7851901054382324, "learning_rate": 2.198750144822728e-06, "loss": 4.2265, "step": 1338 }, { "epoch": 1.0584980237154151, "grad_norm": 2.241931438446045, "learning_rate": 2.197640008968971e-06, "loss": 4.1936, "step": 1339 }, { "epoch": 1.0592885375494072, "grad_norm": 2.2362005710601807, "learning_rate": 2.1965293853074495e-06, "loss": 4.0076, "step": 1340 }, { "epoch": 1.0600790513833993, "grad_norm": 2.3556628227233887, "learning_rate": 2.1954182746147394e-06, "loss": 3.7897, "step": 1341 }, { "epoch": 1.0608695652173914, "grad_norm": 2.2612974643707275, "learning_rate": 2.194306677667759e-06, "loss": 4.013, "step": 1342 }, { "epoch": 1.0616600790513835, "grad_norm": 2.6064469814300537, "learning_rate": 2.1931945952437634e-06, "loss": 3.3859, "step": 1343 }, { "epoch": 1.0624505928853756, "grad_norm": 2.301410675048828, "learning_rate": 2.192082028120351e-06, "loss": 3.97, "step": 1344 }, { "epoch": 1.0632411067193677, "grad_norm": 2.2862329483032227, "learning_rate": 2.190968977075456e-06, "loss": 4.2722, "step": 1345 }, { "epoch": 1.0640316205533598, "grad_norm": 2.641251802444458, "learning_rate": 2.189855442887353e-06, "loss": 4.1968, "step": 1346 }, { "epoch": 1.0648221343873518, "grad_norm": 3.132054328918457, "learning_rate": 2.1887414263346526e-06, "loss": 3.8425, "step": 1347 }, { "epoch": 1.065612648221344, "grad_norm": 2.312709093093872, "learning_rate": 2.1876269281963053e-06, "loss": 4.1744, "step": 1348 }, { "epoch": 1.066403162055336, "grad_norm": 2.5211262702941895, "learning_rate": 2.1865119492515952e-06, "loss": 3.9143, "step": 1349 }, { "epoch": 1.0671936758893281, "grad_norm": 2.0946109294891357, "learning_rate": 2.185396490280145e-06, "loss": 3.9023, "step": 1350 }, { "epoch": 1.0679841897233202, "grad_norm": 1.9249626398086548, "learning_rate": 2.1842805520619124e-06, "loss": 4.4834, "step": 1351 }, { "epoch": 1.0687747035573123, "grad_norm": 2.219245195388794, "learning_rate": 2.183164135377189e-06, "loss": 3.9719, "step": 1352 }, { "epoch": 1.0695652173913044, "grad_norm": 2.1126625537872314, "learning_rate": 2.1820472410066024e-06, "loss": 3.7885, "step": 1353 }, { "epoch": 1.0703557312252965, "grad_norm": 2.094639778137207, "learning_rate": 2.1809298697311144e-06, "loss": 4.2202, "step": 1354 }, { "epoch": 1.0711462450592886, "grad_norm": 2.182352304458618, "learning_rate": 2.1798120223320188e-06, "loss": 3.9478, "step": 1355 }, { "epoch": 1.0719367588932807, "grad_norm": 1.9283579587936401, "learning_rate": 2.1786936995909432e-06, "loss": 4.1541, "step": 1356 }, { "epoch": 1.0727272727272728, "grad_norm": 2.275405168533325, "learning_rate": 2.177574902289848e-06, "loss": 4.0397, "step": 1357 }, { "epoch": 1.0735177865612648, "grad_norm": 2.4294352531433105, "learning_rate": 2.1764556312110246e-06, "loss": 4.0201, "step": 1358 }, { "epoch": 1.074308300395257, "grad_norm": 2.03711199760437, "learning_rate": 2.1753358871370957e-06, "loss": 3.8294, "step": 1359 }, { "epoch": 1.075098814229249, "grad_norm": 2.3146920204162598, "learning_rate": 2.174215670851016e-06, "loss": 3.7905, "step": 1360 }, { "epoch": 1.0758893280632411, "grad_norm": 3.0557079315185547, "learning_rate": 2.1730949831360683e-06, "loss": 3.8915, "step": 1361 }, { "epoch": 1.0766798418972332, "grad_norm": 2.100375175476074, "learning_rate": 2.171973824775867e-06, "loss": 4.0453, "step": 1362 }, { "epoch": 1.0774703557312253, "grad_norm": 2.004873514175415, "learning_rate": 2.170852196554354e-06, "loss": 4.1593, "step": 1363 }, { "epoch": 1.0782608695652174, "grad_norm": 2.262510061264038, "learning_rate": 2.169730099255801e-06, "loss": 3.7841, "step": 1364 }, { "epoch": 1.0790513833992095, "grad_norm": 2.0645906925201416, "learning_rate": 2.1686075336648078e-06, "loss": 4.0431, "step": 1365 }, { "epoch": 1.0798418972332016, "grad_norm": 2.184119701385498, "learning_rate": 2.1674845005662992e-06, "loss": 4.0262, "step": 1366 }, { "epoch": 1.0806324110671937, "grad_norm": 2.0748050212860107, "learning_rate": 2.1663610007455306e-06, "loss": 3.9383, "step": 1367 }, { "epoch": 1.0814229249011857, "grad_norm": 2.0542962551116943, "learning_rate": 2.1652370349880808e-06, "loss": 4.0825, "step": 1368 }, { "epoch": 1.0822134387351778, "grad_norm": 2.2507805824279785, "learning_rate": 2.164112604079856e-06, "loss": 4.0645, "step": 1369 }, { "epoch": 1.08300395256917, "grad_norm": 2.1258931159973145, "learning_rate": 2.162987708807086e-06, "loss": 4.3389, "step": 1370 }, { "epoch": 1.083794466403162, "grad_norm": 2.282353401184082, "learning_rate": 2.161862349956328e-06, "loss": 4.0108, "step": 1371 }, { "epoch": 1.084584980237154, "grad_norm": 2.2821943759918213, "learning_rate": 2.1607365283144604e-06, "loss": 3.9506, "step": 1372 }, { "epoch": 1.0853754940711462, "grad_norm": 2.183332681655884, "learning_rate": 2.1596102446686876e-06, "loss": 3.9693, "step": 1373 }, { "epoch": 1.0861660079051383, "grad_norm": 2.1429736614227295, "learning_rate": 2.1584834998065347e-06, "loss": 4.0108, "step": 1374 }, { "epoch": 1.0869565217391304, "grad_norm": 2.571254253387451, "learning_rate": 2.157356294515852e-06, "loss": 4.0481, "step": 1375 }, { "epoch": 1.0877470355731225, "grad_norm": 2.0066440105438232, "learning_rate": 2.156228629584809e-06, "loss": 4.1151, "step": 1376 }, { "epoch": 1.0885375494071146, "grad_norm": 2.247004747390747, "learning_rate": 2.1551005058018987e-06, "loss": 4.0261, "step": 1377 }, { "epoch": 1.0893280632411066, "grad_norm": 2.055650234222412, "learning_rate": 2.153971923955934e-06, "loss": 4.1663, "step": 1378 }, { "epoch": 1.0901185770750987, "grad_norm": 2.103832244873047, "learning_rate": 2.1528428848360483e-06, "loss": 4.092, "step": 1379 }, { "epoch": 1.0909090909090908, "grad_norm": 2.288001537322998, "learning_rate": 2.1517133892316947e-06, "loss": 3.6301, "step": 1380 }, { "epoch": 1.091699604743083, "grad_norm": 2.146334648132324, "learning_rate": 2.1505834379326453e-06, "loss": 3.9888, "step": 1381 }, { "epoch": 1.092490118577075, "grad_norm": 1.9856783151626587, "learning_rate": 2.1494530317289908e-06, "loss": 4.2673, "step": 1382 }, { "epoch": 1.093280632411067, "grad_norm": 2.1706995964050293, "learning_rate": 2.1483221714111404e-06, "loss": 3.8497, "step": 1383 }, { "epoch": 1.0940711462450592, "grad_norm": 3.2591047286987305, "learning_rate": 2.1471908577698205e-06, "loss": 3.8122, "step": 1384 }, { "epoch": 1.0948616600790513, "grad_norm": 2.282994508743286, "learning_rate": 2.146059091596075e-06, "loss": 3.775, "step": 1385 }, { "epoch": 1.0956521739130434, "grad_norm": 2.1719753742218018, "learning_rate": 2.1449268736812635e-06, "loss": 4.1436, "step": 1386 }, { "epoch": 1.0964426877470355, "grad_norm": 1.9897669553756714, "learning_rate": 2.1437942048170614e-06, "loss": 4.1345, "step": 1387 }, { "epoch": 1.0972332015810276, "grad_norm": 2.1083037853240967, "learning_rate": 2.14266108579546e-06, "loss": 4.1812, "step": 1388 }, { "epoch": 1.0980237154150199, "grad_norm": 2.1097238063812256, "learning_rate": 2.1415275174087657e-06, "loss": 4.0393, "step": 1389 }, { "epoch": 1.098814229249012, "grad_norm": 2.224491834640503, "learning_rate": 2.1403935004495983e-06, "loss": 3.8407, "step": 1390 }, { "epoch": 1.099604743083004, "grad_norm": 2.0828213691711426, "learning_rate": 2.1392590357108908e-06, "loss": 3.9362, "step": 1391 }, { "epoch": 1.1003952569169961, "grad_norm": 2.205451726913452, "learning_rate": 2.1381241239858907e-06, "loss": 4.0107, "step": 1392 }, { "epoch": 1.1011857707509882, "grad_norm": 1.9696890115737915, "learning_rate": 2.1369887660681575e-06, "loss": 4.2607, "step": 1393 }, { "epoch": 1.1019762845849803, "grad_norm": 2.221892833709717, "learning_rate": 2.1358529627515617e-06, "loss": 4.0296, "step": 1394 }, { "epoch": 1.1027667984189724, "grad_norm": 2.3276500701904297, "learning_rate": 2.134716714830287e-06, "loss": 3.8421, "step": 1395 }, { "epoch": 1.1035573122529645, "grad_norm": 2.1312291622161865, "learning_rate": 2.1335800230988266e-06, "loss": 3.9202, "step": 1396 }, { "epoch": 1.1043478260869566, "grad_norm": 2.069363594055176, "learning_rate": 2.1324428883519844e-06, "loss": 4.0741, "step": 1397 }, { "epoch": 1.1051383399209487, "grad_norm": 2.129798173904419, "learning_rate": 2.1313053113848745e-06, "loss": 3.9404, "step": 1398 }, { "epoch": 1.1059288537549408, "grad_norm": 2.11584734916687, "learning_rate": 2.1301672929929194e-06, "loss": 4.1071, "step": 1399 }, { "epoch": 1.1067193675889329, "grad_norm": 2.035417318344116, "learning_rate": 2.129028833971851e-06, "loss": 3.8607, "step": 1400 }, { "epoch": 1.107509881422925, "grad_norm": 2.7795073986053467, "learning_rate": 2.1278899351177088e-06, "loss": 3.7247, "step": 1401 }, { "epoch": 1.108300395256917, "grad_norm": 2.247523069381714, "learning_rate": 2.1267505972268405e-06, "loss": 4.1783, "step": 1402 }, { "epoch": 1.1090909090909091, "grad_norm": 2.126983642578125, "learning_rate": 2.1256108210959e-06, "loss": 4.003, "step": 1403 }, { "epoch": 1.1098814229249012, "grad_norm": 1.8802413940429688, "learning_rate": 2.1244706075218476e-06, "loss": 4.3344, "step": 1404 }, { "epoch": 1.1106719367588933, "grad_norm": 2.1202855110168457, "learning_rate": 2.123329957301951e-06, "loss": 4.1305, "step": 1405 }, { "epoch": 1.1114624505928854, "grad_norm": 2.3102571964263916, "learning_rate": 2.1221888712337804e-06, "loss": 3.8563, "step": 1406 }, { "epoch": 1.1122529644268775, "grad_norm": 2.2935101985931396, "learning_rate": 2.1210473501152136e-06, "loss": 3.861, "step": 1407 }, { "epoch": 1.1130434782608696, "grad_norm": 2.2132043838500977, "learning_rate": 2.119905394744431e-06, "loss": 3.8595, "step": 1408 }, { "epoch": 1.1138339920948617, "grad_norm": 2.1306591033935547, "learning_rate": 2.118763005919918e-06, "loss": 4.104, "step": 1409 }, { "epoch": 1.1146245059288538, "grad_norm": 2.717627763748169, "learning_rate": 2.1176201844404603e-06, "loss": 3.6851, "step": 1410 }, { "epoch": 1.1154150197628458, "grad_norm": 2.4390509128570557, "learning_rate": 2.1164769311051497e-06, "loss": 3.855, "step": 1411 }, { "epoch": 1.116205533596838, "grad_norm": 2.189591884613037, "learning_rate": 2.115333246713377e-06, "loss": 4.0212, "step": 1412 }, { "epoch": 1.11699604743083, "grad_norm": 2.2113914489746094, "learning_rate": 2.1141891320648357e-06, "loss": 4.2177, "step": 1413 }, { "epoch": 1.1177865612648221, "grad_norm": 2.1588196754455566, "learning_rate": 2.113044587959521e-06, "loss": 3.8131, "step": 1414 }, { "epoch": 1.1185770750988142, "grad_norm": 2.0815742015838623, "learning_rate": 2.111899615197727e-06, "loss": 4.2751, "step": 1415 }, { "epoch": 1.1193675889328063, "grad_norm": 2.196073532104492, "learning_rate": 2.110754214580048e-06, "loss": 4.2863, "step": 1416 }, { "epoch": 1.1201581027667984, "grad_norm": 2.1659107208251953, "learning_rate": 2.1096083869073767e-06, "loss": 4.0538, "step": 1417 }, { "epoch": 1.1209486166007905, "grad_norm": 2.196798086166382, "learning_rate": 2.108462132980906e-06, "loss": 4.0269, "step": 1418 }, { "epoch": 1.1217391304347826, "grad_norm": 2.471928834915161, "learning_rate": 2.1073154536021263e-06, "loss": 3.618, "step": 1419 }, { "epoch": 1.1225296442687747, "grad_norm": 2.1871845722198486, "learning_rate": 2.1061683495728244e-06, "loss": 4.0313, "step": 1420 }, { "epoch": 1.1233201581027668, "grad_norm": 2.106441020965576, "learning_rate": 2.105020821695085e-06, "loss": 4.2569, "step": 1421 }, { "epoch": 1.1241106719367588, "grad_norm": 2.2740273475646973, "learning_rate": 2.1038728707712895e-06, "loss": 3.97, "step": 1422 }, { "epoch": 1.124901185770751, "grad_norm": 2.20605731010437, "learning_rate": 2.1027244976041137e-06, "loss": 3.7651, "step": 1423 }, { "epoch": 1.125691699604743, "grad_norm": 2.7598094940185547, "learning_rate": 2.10157570299653e-06, "loss": 3.8783, "step": 1424 }, { "epoch": 1.1264822134387351, "grad_norm": 3.4910006523132324, "learning_rate": 2.1004264877518056e-06, "loss": 4.068, "step": 1425 }, { "epoch": 1.1272727272727272, "grad_norm": 1.9469693899154663, "learning_rate": 2.0992768526735003e-06, "loss": 4.1565, "step": 1426 }, { "epoch": 1.1280632411067193, "grad_norm": 2.3408448696136475, "learning_rate": 2.098126798565469e-06, "loss": 4.1517, "step": 1427 }, { "epoch": 1.1288537549407114, "grad_norm": 2.4391307830810547, "learning_rate": 2.096976326231858e-06, "loss": 3.3572, "step": 1428 }, { "epoch": 1.1296442687747035, "grad_norm": 2.2574424743652344, "learning_rate": 2.0958254364771085e-06, "loss": 4.214, "step": 1429 }, { "epoch": 1.1304347826086956, "grad_norm": 2.1239287853240967, "learning_rate": 2.0946741301059515e-06, "loss": 4.1362, "step": 1430 }, { "epoch": 1.1312252964426877, "grad_norm": 2.133370876312256, "learning_rate": 2.09352240792341e-06, "loss": 4.0548, "step": 1431 }, { "epoch": 1.1320158102766797, "grad_norm": 2.1157875061035156, "learning_rate": 2.092370270734797e-06, "loss": 4.2271, "step": 1432 }, { "epoch": 1.132806324110672, "grad_norm": 2.2389700412750244, "learning_rate": 2.0912177193457165e-06, "loss": 3.9153, "step": 1433 }, { "epoch": 1.1335968379446641, "grad_norm": 2.1353187561035156, "learning_rate": 2.0900647545620626e-06, "loss": 3.9521, "step": 1434 }, { "epoch": 1.1343873517786562, "grad_norm": 2.417574644088745, "learning_rate": 2.088911377190018e-06, "loss": 4.1885, "step": 1435 }, { "epoch": 1.1351778656126483, "grad_norm": 2.5828168392181396, "learning_rate": 2.0877575880360523e-06, "loss": 3.7441, "step": 1436 }, { "epoch": 1.1359683794466404, "grad_norm": 2.1555426120758057, "learning_rate": 2.0866033879069255e-06, "loss": 4.3709, "step": 1437 }, { "epoch": 1.1367588932806325, "grad_norm": 2.0661940574645996, "learning_rate": 2.0854487776096843e-06, "loss": 4.2199, "step": 1438 }, { "epoch": 1.1375494071146246, "grad_norm": 2.293947696685791, "learning_rate": 2.0842937579516604e-06, "loss": 3.7903, "step": 1439 }, { "epoch": 1.1383399209486167, "grad_norm": 2.2355501651763916, "learning_rate": 2.0831383297404745e-06, "loss": 3.8696, "step": 1440 }, { "epoch": 1.1391304347826088, "grad_norm": 2.4645168781280518, "learning_rate": 2.0819824937840307e-06, "loss": 3.8152, "step": 1441 }, { "epoch": 1.1399209486166009, "grad_norm": 2.023700475692749, "learning_rate": 2.0808262508905194e-06, "loss": 4.1446, "step": 1442 }, { "epoch": 1.140711462450593, "grad_norm": 2.2049524784088135, "learning_rate": 2.0796696018684154e-06, "loss": 4.2022, "step": 1443 }, { "epoch": 1.141501976284585, "grad_norm": 1.9678200483322144, "learning_rate": 2.078512547526477e-06, "loss": 4.3702, "step": 1444 }, { "epoch": 1.1422924901185771, "grad_norm": 2.171170949935913, "learning_rate": 2.077355088673747e-06, "loss": 3.9621, "step": 1445 }, { "epoch": 1.1430830039525692, "grad_norm": 2.5036368370056152, "learning_rate": 2.076197226119549e-06, "loss": 3.774, "step": 1446 }, { "epoch": 1.1438735177865613, "grad_norm": 2.346003293991089, "learning_rate": 2.075038960673491e-06, "loss": 3.9801, "step": 1447 }, { "epoch": 1.1446640316205534, "grad_norm": 2.147608995437622, "learning_rate": 2.0738802931454624e-06, "loss": 4.088, "step": 1448 }, { "epoch": 1.1454545454545455, "grad_norm": 2.4398839473724365, "learning_rate": 2.0727212243456318e-06, "loss": 4.4384, "step": 1449 }, { "epoch": 1.1462450592885376, "grad_norm": 2.212193727493286, "learning_rate": 2.0715617550844505e-06, "loss": 4.092, "step": 1450 }, { "epoch": 1.1470355731225297, "grad_norm": 2.4071381092071533, "learning_rate": 2.0704018861726494e-06, "loss": 4.0257, "step": 1451 }, { "epoch": 1.1478260869565218, "grad_norm": 2.148146152496338, "learning_rate": 2.0692416184212384e-06, "loss": 3.985, "step": 1452 }, { "epoch": 1.1486166007905139, "grad_norm": 2.079291343688965, "learning_rate": 2.0680809526415064e-06, "loss": 4.0022, "step": 1453 }, { "epoch": 1.149407114624506, "grad_norm": 2.199113368988037, "learning_rate": 2.0669198896450208e-06, "loss": 3.9763, "step": 1454 }, { "epoch": 1.150197628458498, "grad_norm": 5.218774795532227, "learning_rate": 2.0657584302436257e-06, "loss": 4.0267, "step": 1455 }, { "epoch": 1.1509881422924901, "grad_norm": 2.148986339569092, "learning_rate": 2.0645965752494445e-06, "loss": 4.2069, "step": 1456 }, { "epoch": 1.1517786561264822, "grad_norm": 2.1089625358581543, "learning_rate": 2.063434325474875e-06, "loss": 4.0092, "step": 1457 }, { "epoch": 1.1525691699604743, "grad_norm": 1.932489037513733, "learning_rate": 2.062271681732593e-06, "loss": 4.3327, "step": 1458 }, { "epoch": 1.1533596837944664, "grad_norm": 2.010065793991089, "learning_rate": 2.0611086448355477e-06, "loss": 4.2774, "step": 1459 }, { "epoch": 1.1541501976284585, "grad_norm": 2.051921844482422, "learning_rate": 2.0599452155969653e-06, "loss": 4.056, "step": 1460 }, { "epoch": 1.1549407114624506, "grad_norm": 2.0864663124084473, "learning_rate": 2.0587813948303444e-06, "loss": 4.0485, "step": 1461 }, { "epoch": 1.1557312252964427, "grad_norm": 2.3873112201690674, "learning_rate": 2.057617183349459e-06, "loss": 4.076, "step": 1462 }, { "epoch": 1.1565217391304348, "grad_norm": 2.1227591037750244, "learning_rate": 2.0564525819683547e-06, "loss": 4.2859, "step": 1463 }, { "epoch": 1.1573122529644269, "grad_norm": 2.0744259357452393, "learning_rate": 2.055287591501352e-06, "loss": 4.0091, "step": 1464 }, { "epoch": 1.158102766798419, "grad_norm": 3.377063274383545, "learning_rate": 2.054122212763041e-06, "loss": 3.8936, "step": 1465 }, { "epoch": 1.158893280632411, "grad_norm": 2.311326503753662, "learning_rate": 2.052956446568285e-06, "loss": 3.9886, "step": 1466 }, { "epoch": 1.1596837944664031, "grad_norm": 2.2459638118743896, "learning_rate": 2.0517902937322166e-06, "loss": 3.7791, "step": 1467 }, { "epoch": 1.1604743083003952, "grad_norm": 2.09824275970459, "learning_rate": 2.050623755070241e-06, "loss": 4.2044, "step": 1468 }, { "epoch": 1.1612648221343873, "grad_norm": 2.6167166233062744, "learning_rate": 2.0494568313980308e-06, "loss": 3.712, "step": 1469 }, { "epoch": 1.1620553359683794, "grad_norm": 2.357640027999878, "learning_rate": 2.04828952353153e-06, "loss": 3.5002, "step": 1470 }, { "epoch": 1.1628458498023715, "grad_norm": 2.093240976333618, "learning_rate": 2.0471218322869488e-06, "loss": 4.1159, "step": 1471 }, { "epoch": 1.1636363636363636, "grad_norm": 2.091573476791382, "learning_rate": 2.045953758480768e-06, "loss": 4.0646, "step": 1472 }, { "epoch": 1.1644268774703557, "grad_norm": 2.2417361736297607, "learning_rate": 2.0447853029297344e-06, "loss": 4.0479, "step": 1473 }, { "epoch": 1.1652173913043478, "grad_norm": 2.1156797409057617, "learning_rate": 2.043616466450862e-06, "loss": 4.134, "step": 1474 }, { "epoch": 1.1660079051383399, "grad_norm": 2.379591941833496, "learning_rate": 2.0424472498614316e-06, "loss": 3.9896, "step": 1475 }, { "epoch": 1.166798418972332, "grad_norm": 2.2574524879455566, "learning_rate": 2.0412776539789884e-06, "loss": 4.1966, "step": 1476 }, { "epoch": 1.167588932806324, "grad_norm": 2.054675817489624, "learning_rate": 2.0401076796213446e-06, "loss": 4.1069, "step": 1477 }, { "epoch": 1.1683794466403161, "grad_norm": 2.4749033451080322, "learning_rate": 2.038937327606576e-06, "loss": 3.7845, "step": 1478 }, { "epoch": 1.1691699604743082, "grad_norm": 2.0602803230285645, "learning_rate": 2.037766598753023e-06, "loss": 3.9696, "step": 1479 }, { "epoch": 1.1699604743083003, "grad_norm": 2.2878048419952393, "learning_rate": 2.036595493879289e-06, "loss": 3.9255, "step": 1480 }, { "epoch": 1.1707509881422924, "grad_norm": 2.482347249984741, "learning_rate": 2.03542401380424e-06, "loss": 3.9905, "step": 1481 }, { "epoch": 1.1715415019762845, "grad_norm": 1.9517629146575928, "learning_rate": 2.034252159347005e-06, "loss": 4.2949, "step": 1482 }, { "epoch": 1.1723320158102766, "grad_norm": 2.0327064990997314, "learning_rate": 2.0330799313269757e-06, "loss": 4.1475, "step": 1483 }, { "epoch": 1.1731225296442687, "grad_norm": 2.7474522590637207, "learning_rate": 2.0319073305638034e-06, "loss": 3.9472, "step": 1484 }, { "epoch": 1.1739130434782608, "grad_norm": 1.9810309410095215, "learning_rate": 2.0307343578774002e-06, "loss": 4.1065, "step": 1485 }, { "epoch": 1.1747035573122528, "grad_norm": 2.071566343307495, "learning_rate": 2.0295610140879392e-06, "loss": 4.1801, "step": 1486 }, { "epoch": 1.1754940711462452, "grad_norm": 2.1847622394561768, "learning_rate": 2.0283873000158514e-06, "loss": 4.1948, "step": 1487 }, { "epoch": 1.1762845849802372, "grad_norm": 2.025266647338867, "learning_rate": 2.027213216481829e-06, "loss": 4.3174, "step": 1488 }, { "epoch": 1.1770750988142293, "grad_norm": 1.99769926071167, "learning_rate": 2.0260387643068213e-06, "loss": 4.2198, "step": 1489 }, { "epoch": 1.1778656126482214, "grad_norm": 2.320268154144287, "learning_rate": 2.0248639443120338e-06, "loss": 4.0189, "step": 1490 }, { "epoch": 1.1786561264822135, "grad_norm": 2.198558807373047, "learning_rate": 2.023688757318932e-06, "loss": 4.0272, "step": 1491 }, { "epoch": 1.1794466403162056, "grad_norm": 2.590247392654419, "learning_rate": 2.0225132041492368e-06, "loss": 3.7263, "step": 1492 }, { "epoch": 1.1802371541501977, "grad_norm": 2.2524635791778564, "learning_rate": 2.0213372856249245e-06, "loss": 3.9962, "step": 1493 }, { "epoch": 1.1810276679841898, "grad_norm": 2.3295302391052246, "learning_rate": 2.0201610025682278e-06, "loss": 3.8429, "step": 1494 }, { "epoch": 1.1818181818181819, "grad_norm": 2.353668689727783, "learning_rate": 2.0189843558016343e-06, "loss": 3.6535, "step": 1495 }, { "epoch": 1.182608695652174, "grad_norm": 2.2424089908599854, "learning_rate": 2.0178073461478845e-06, "loss": 3.9343, "step": 1496 }, { "epoch": 1.183399209486166, "grad_norm": 2.059925079345703, "learning_rate": 2.0166299744299747e-06, "loss": 4.1425, "step": 1497 }, { "epoch": 1.1841897233201581, "grad_norm": 2.487482786178589, "learning_rate": 2.0154522414711526e-06, "loss": 3.7337, "step": 1498 }, { "epoch": 1.1849802371541502, "grad_norm": 2.2517945766448975, "learning_rate": 2.01427414809492e-06, "loss": 4.4896, "step": 1499 }, { "epoch": 1.1857707509881423, "grad_norm": 2.2310140132904053, "learning_rate": 2.0130956951250293e-06, "loss": 4.0627, "step": 1500 }, { "epoch": 1.1865612648221344, "grad_norm": 2.1948037147521973, "learning_rate": 2.011916883385486e-06, "loss": 3.5468, "step": 1501 }, { "epoch": 1.1873517786561265, "grad_norm": 2.1291909217834473, "learning_rate": 2.010737713700544e-06, "loss": 4.1581, "step": 1502 }, { "epoch": 1.1881422924901186, "grad_norm": 2.1103808879852295, "learning_rate": 2.0095581868947102e-06, "loss": 4.2039, "step": 1503 }, { "epoch": 1.1889328063241107, "grad_norm": 2.0917322635650635, "learning_rate": 2.0083783037927393e-06, "loss": 4.1663, "step": 1504 }, { "epoch": 1.1897233201581028, "grad_norm": 2.5950214862823486, "learning_rate": 2.007198065219636e-06, "loss": 3.8093, "step": 1505 }, { "epoch": 1.1905138339920949, "grad_norm": 2.0880143642425537, "learning_rate": 2.0060174720006537e-06, "loss": 4.0153, "step": 1506 }, { "epoch": 1.191304347826087, "grad_norm": 2.14943528175354, "learning_rate": 2.0048365249612924e-06, "loss": 4.1733, "step": 1507 }, { "epoch": 1.192094861660079, "grad_norm": 2.1642074584960938, "learning_rate": 2.0036552249273018e-06, "loss": 3.7666, "step": 1508 }, { "epoch": 1.1928853754940711, "grad_norm": 2.1144328117370605, "learning_rate": 2.0024735727246762e-06, "loss": 3.8627, "step": 1509 }, { "epoch": 1.1936758893280632, "grad_norm": 2.2802963256835938, "learning_rate": 2.0012915691796574e-06, "loss": 3.7801, "step": 1510 }, { "epoch": 1.1944664031620553, "grad_norm": 2.0493569374084473, "learning_rate": 2.0001092151187326e-06, "loss": 4.1129, "step": 1511 }, { "epoch": 1.1952569169960474, "grad_norm": 2.1980769634246826, "learning_rate": 1.998926511368634e-06, "loss": 4.0779, "step": 1512 }, { "epoch": 1.1960474308300395, "grad_norm": 2.0694122314453125, "learning_rate": 1.997743458756338e-06, "loss": 4.1083, "step": 1513 }, { "epoch": 1.1968379446640316, "grad_norm": 2.538334846496582, "learning_rate": 1.996560058109066e-06, "loss": 4.2548, "step": 1514 }, { "epoch": 1.1976284584980237, "grad_norm": 2.0358810424804688, "learning_rate": 1.995376310254282e-06, "loss": 4.2762, "step": 1515 }, { "epoch": 1.1984189723320158, "grad_norm": 2.2423224449157715, "learning_rate": 1.994192216019692e-06, "loss": 3.8159, "step": 1516 }, { "epoch": 1.1992094861660079, "grad_norm": 2.1298282146453857, "learning_rate": 1.993007776233246e-06, "loss": 4.2198, "step": 1517 }, { "epoch": 1.2, "grad_norm": 2.1104695796966553, "learning_rate": 1.991822991723134e-06, "loss": 4.2988, "step": 1518 }, { "epoch": 1.200790513833992, "grad_norm": 2.26631760597229, "learning_rate": 1.9906378633177876e-06, "loss": 3.9821, "step": 1519 }, { "epoch": 1.2015810276679841, "grad_norm": 2.0143826007843018, "learning_rate": 1.9894523918458792e-06, "loss": 4.2517, "step": 1520 }, { "epoch": 1.2023715415019762, "grad_norm": 2.1158740520477295, "learning_rate": 1.988266578136321e-06, "loss": 3.9569, "step": 1521 }, { "epoch": 1.2031620553359683, "grad_norm": 2.09120512008667, "learning_rate": 1.987080423018264e-06, "loss": 4.1984, "step": 1522 }, { "epoch": 1.2039525691699604, "grad_norm": 2.037255048751831, "learning_rate": 1.9858939273210983e-06, "loss": 4.3066, "step": 1523 }, { "epoch": 1.2047430830039525, "grad_norm": 2.1485722064971924, "learning_rate": 1.9847070918744525e-06, "loss": 3.8854, "step": 1524 }, { "epoch": 1.2055335968379446, "grad_norm": 2.050283908843994, "learning_rate": 1.983519917508192e-06, "loss": 4.2589, "step": 1525 }, { "epoch": 1.2063241106719367, "grad_norm": 2.0792157649993896, "learning_rate": 1.982332405052419e-06, "loss": 4.1052, "step": 1526 }, { "epoch": 1.2071146245059288, "grad_norm": 2.473686695098877, "learning_rate": 1.981144555337474e-06, "loss": 3.836, "step": 1527 }, { "epoch": 1.2079051383399209, "grad_norm": 2.1057801246643066, "learning_rate": 1.9799563691939304e-06, "loss": 3.9329, "step": 1528 }, { "epoch": 1.208695652173913, "grad_norm": 2.1520676612854004, "learning_rate": 1.9787678474525993e-06, "loss": 3.856, "step": 1529 }, { "epoch": 1.2094861660079053, "grad_norm": 2.2979652881622314, "learning_rate": 1.9775789909445253e-06, "loss": 3.9216, "step": 1530 }, { "epoch": 1.2102766798418974, "grad_norm": 2.108942747116089, "learning_rate": 1.976389800500988e-06, "loss": 3.9831, "step": 1531 }, { "epoch": 1.2110671936758894, "grad_norm": 2.5085113048553467, "learning_rate": 1.9752002769534985e-06, "loss": 3.9172, "step": 1532 }, { "epoch": 1.2118577075098815, "grad_norm": 2.240234375, "learning_rate": 1.974010421133803e-06, "loss": 4.1432, "step": 1533 }, { "epoch": 1.2126482213438736, "grad_norm": 3.742997884750366, "learning_rate": 1.972820233873879e-06, "loss": 4.2001, "step": 1534 }, { "epoch": 1.2134387351778657, "grad_norm": 2.181192636489868, "learning_rate": 1.971629716005936e-06, "loss": 4.1122, "step": 1535 }, { "epoch": 1.2142292490118578, "grad_norm": 2.116905450820923, "learning_rate": 1.970438868362414e-06, "loss": 4.1618, "step": 1536 }, { "epoch": 1.21501976284585, "grad_norm": 2.1239283084869385, "learning_rate": 1.969247691775985e-06, "loss": 4.1542, "step": 1537 }, { "epoch": 1.215810276679842, "grad_norm": 2.376891613006592, "learning_rate": 1.96805618707955e-06, "loss": 3.75, "step": 1538 }, { "epoch": 1.216600790513834, "grad_norm": 2.171614408493042, "learning_rate": 1.9668643551062393e-06, "loss": 4.0201, "step": 1539 }, { "epoch": 1.2173913043478262, "grad_norm": 2.0883219242095947, "learning_rate": 1.9656721966894124e-06, "loss": 4.0959, "step": 1540 }, { "epoch": 1.2181818181818183, "grad_norm": 2.415523052215576, "learning_rate": 1.9644797126626573e-06, "loss": 4.1904, "step": 1541 }, { "epoch": 1.2189723320158103, "grad_norm": 2.3563196659088135, "learning_rate": 1.963286903859789e-06, "loss": 3.9492, "step": 1542 }, { "epoch": 1.2197628458498024, "grad_norm": 1.985156774520874, "learning_rate": 1.9620937711148507e-06, "loss": 4.38, "step": 1543 }, { "epoch": 1.2205533596837945, "grad_norm": 2.2022509574890137, "learning_rate": 1.9609003152621102e-06, "loss": 4.1792, "step": 1544 }, { "epoch": 1.2213438735177866, "grad_norm": 2.0640406608581543, "learning_rate": 1.9597065371360635e-06, "loss": 4.1707, "step": 1545 }, { "epoch": 1.2221343873517787, "grad_norm": 3.09027361869812, "learning_rate": 1.95851243757143e-06, "loss": 3.9007, "step": 1546 }, { "epoch": 1.2229249011857708, "grad_norm": 2.3172762393951416, "learning_rate": 1.957318017403156e-06, "loss": 3.9308, "step": 1547 }, { "epoch": 1.2237154150197629, "grad_norm": 2.198106288909912, "learning_rate": 1.9561232774664084e-06, "loss": 3.8358, "step": 1548 }, { "epoch": 1.224505928853755, "grad_norm": 2.022758960723877, "learning_rate": 1.954928218596582e-06, "loss": 4.2452, "step": 1549 }, { "epoch": 1.225296442687747, "grad_norm": 2.197477102279663, "learning_rate": 1.953732841629292e-06, "loss": 4.1556, "step": 1550 }, { "epoch": 1.2260869565217392, "grad_norm": 2.150117874145508, "learning_rate": 1.9525371474003765e-06, "loss": 4.2541, "step": 1551 }, { "epoch": 1.2268774703557312, "grad_norm": 2.3054263591766357, "learning_rate": 1.9513411367458955e-06, "loss": 3.7092, "step": 1552 }, { "epoch": 1.2276679841897233, "grad_norm": 2.065427780151367, "learning_rate": 1.95014481050213e-06, "loss": 4.0302, "step": 1553 }, { "epoch": 1.2284584980237154, "grad_norm": 2.7850115299224854, "learning_rate": 1.9489481695055828e-06, "loss": 3.5142, "step": 1554 }, { "epoch": 1.2292490118577075, "grad_norm": 2.004103183746338, "learning_rate": 1.9477512145929744e-06, "loss": 4.1091, "step": 1555 }, { "epoch": 1.2300395256916996, "grad_norm": 2.0483057498931885, "learning_rate": 1.9465539466012487e-06, "loss": 4.1211, "step": 1556 }, { "epoch": 1.2308300395256917, "grad_norm": 2.3070883750915527, "learning_rate": 1.945356366367564e-06, "loss": 3.9606, "step": 1557 }, { "epoch": 1.2316205533596838, "grad_norm": 2.010657548904419, "learning_rate": 1.9441584747293e-06, "loss": 4.2453, "step": 1558 }, { "epoch": 1.2324110671936759, "grad_norm": 2.0644540786743164, "learning_rate": 1.942960272524053e-06, "loss": 4.3266, "step": 1559 }, { "epoch": 1.233201581027668, "grad_norm": 2.179370403289795, "learning_rate": 1.941761760589637e-06, "loss": 3.9959, "step": 1560 }, { "epoch": 1.23399209486166, "grad_norm": 2.061033248901367, "learning_rate": 1.9405629397640818e-06, "loss": 4.1574, "step": 1561 }, { "epoch": 1.2347826086956522, "grad_norm": 2.3140883445739746, "learning_rate": 1.939363810885634e-06, "loss": 4.067, "step": 1562 }, { "epoch": 1.2355731225296442, "grad_norm": 2.159723997116089, "learning_rate": 1.938164374792755e-06, "loss": 3.886, "step": 1563 }, { "epoch": 1.2363636363636363, "grad_norm": 1.8726569414138794, "learning_rate": 1.9369646323241213e-06, "loss": 4.336, "step": 1564 }, { "epoch": 1.2371541501976284, "grad_norm": 2.3488376140594482, "learning_rate": 1.9357645843186237e-06, "loss": 3.9017, "step": 1565 }, { "epoch": 1.2379446640316205, "grad_norm": 2.151554822921753, "learning_rate": 1.9345642316153665e-06, "loss": 4.1743, "step": 1566 }, { "epoch": 1.2387351778656126, "grad_norm": 2.154963254928589, "learning_rate": 1.9333635750536664e-06, "loss": 4.0003, "step": 1567 }, { "epoch": 1.2395256916996047, "grad_norm": 2.4501819610595703, "learning_rate": 1.9321626154730543e-06, "loss": 3.568, "step": 1568 }, { "epoch": 1.2403162055335968, "grad_norm": 2.0423426628112793, "learning_rate": 1.930961353713271e-06, "loss": 4.0278, "step": 1569 }, { "epoch": 1.2411067193675889, "grad_norm": 2.150160312652588, "learning_rate": 1.92975979061427e-06, "loss": 4.1139, "step": 1570 }, { "epoch": 1.241897233201581, "grad_norm": 2.379758596420288, "learning_rate": 1.9285579270162148e-06, "loss": 3.9587, "step": 1571 }, { "epoch": 1.242687747035573, "grad_norm": 2.1659607887268066, "learning_rate": 1.9273557637594795e-06, "loss": 3.9704, "step": 1572 }, { "epoch": 1.2434782608695651, "grad_norm": 2.7558279037475586, "learning_rate": 1.926153301684647e-06, "loss": 3.5218, "step": 1573 }, { "epoch": 1.2442687747035572, "grad_norm": 2.228193759918213, "learning_rate": 1.92495054163251e-06, "loss": 3.9257, "step": 1574 }, { "epoch": 1.2450592885375493, "grad_norm": 2.055360794067383, "learning_rate": 1.9237474844440687e-06, "loss": 3.9543, "step": 1575 }, { "epoch": 1.2458498023715414, "grad_norm": 2.462397336959839, "learning_rate": 1.922544130960532e-06, "loss": 3.9704, "step": 1576 }, { "epoch": 1.2466403162055335, "grad_norm": 2.1739232540130615, "learning_rate": 1.9213404820233144e-06, "loss": 4.1322, "step": 1577 }, { "epoch": 1.2474308300395256, "grad_norm": 2.022265911102295, "learning_rate": 1.92013653847404e-06, "loss": 4.1971, "step": 1578 }, { "epoch": 1.2482213438735177, "grad_norm": 2.142249584197998, "learning_rate": 1.9189323011545354e-06, "loss": 4.0815, "step": 1579 }, { "epoch": 1.2490118577075098, "grad_norm": 2.0046756267547607, "learning_rate": 1.9177277709068348e-06, "loss": 4.2186, "step": 1580 }, { "epoch": 1.2498023715415019, "grad_norm": 2.0792031288146973, "learning_rate": 1.9165229485731758e-06, "loss": 4.0943, "step": 1581 }, { "epoch": 1.250592885375494, "grad_norm": 2.372459888458252, "learning_rate": 1.915317834996002e-06, "loss": 4.0618, "step": 1582 }, { "epoch": 1.251383399209486, "grad_norm": 2.328979730606079, "learning_rate": 1.9141124310179595e-06, "loss": 4.1313, "step": 1583 }, { "epoch": 1.2521739130434781, "grad_norm": 2.1089773178100586, "learning_rate": 1.9129067374818975e-06, "loss": 4.2089, "step": 1584 }, { "epoch": 1.2529644268774702, "grad_norm": 2.0419559478759766, "learning_rate": 1.911700755230868e-06, "loss": 4.0915, "step": 1585 }, { "epoch": 1.2537549407114623, "grad_norm": 2.528593063354492, "learning_rate": 1.9104944851081246e-06, "loss": 3.9963, "step": 1586 }, { "epoch": 1.2545454545454544, "grad_norm": 2.2620997428894043, "learning_rate": 1.9092879279571214e-06, "loss": 3.9518, "step": 1587 }, { "epoch": 1.2553359683794467, "grad_norm": 2.1870570182800293, "learning_rate": 1.9080810846215155e-06, "loss": 3.8822, "step": 1588 }, { "epoch": 1.2561264822134388, "grad_norm": 2.4856836795806885, "learning_rate": 1.9068739559451617e-06, "loss": 4.123, "step": 1589 }, { "epoch": 1.256916996047431, "grad_norm": 2.131627082824707, "learning_rate": 1.9056665427721158e-06, "loss": 4.0477, "step": 1590 }, { "epoch": 1.257707509881423, "grad_norm": 2.018416404724121, "learning_rate": 1.9044588459466316e-06, "loss": 4.2442, "step": 1591 }, { "epoch": 1.258498023715415, "grad_norm": 2.074655294418335, "learning_rate": 1.9032508663131622e-06, "loss": 4.0577, "step": 1592 }, { "epoch": 1.2592885375494072, "grad_norm": 2.0978341102600098, "learning_rate": 1.9020426047163573e-06, "loss": 4.158, "step": 1593 }, { "epoch": 1.2600790513833993, "grad_norm": 2.02288556098938, "learning_rate": 1.9008340620010643e-06, "loss": 4.1281, "step": 1594 }, { "epoch": 1.2608695652173914, "grad_norm": 2.091020107269287, "learning_rate": 1.899625239012328e-06, "loss": 4.0614, "step": 1595 }, { "epoch": 1.2616600790513834, "grad_norm": 2.411078929901123, "learning_rate": 1.8984161365953881e-06, "loss": 3.8419, "step": 1596 }, { "epoch": 1.2624505928853755, "grad_norm": 2.609356164932251, "learning_rate": 1.8972067555956794e-06, "loss": 3.4371, "step": 1597 }, { "epoch": 1.2632411067193676, "grad_norm": 2.5248188972473145, "learning_rate": 1.8959970968588325e-06, "loss": 3.6446, "step": 1598 }, { "epoch": 1.2640316205533597, "grad_norm": 2.160027027130127, "learning_rate": 1.8947871612306722e-06, "loss": 4.1241, "step": 1599 }, { "epoch": 1.2648221343873518, "grad_norm": 2.1419014930725098, "learning_rate": 1.8935769495572154e-06, "loss": 3.9811, "step": 1600 }, { "epoch": 1.265612648221344, "grad_norm": 2.045757532119751, "learning_rate": 1.8923664626846747e-06, "loss": 4.1361, "step": 1601 }, { "epoch": 1.266403162055336, "grad_norm": 2.126884937286377, "learning_rate": 1.8911557014594518e-06, "loss": 4.1719, "step": 1602 }, { "epoch": 1.267193675889328, "grad_norm": 2.0277910232543945, "learning_rate": 1.8899446667281433e-06, "loss": 4.1185, "step": 1603 }, { "epoch": 1.2679841897233202, "grad_norm": 2.3413820266723633, "learning_rate": 1.8887333593375346e-06, "loss": 4.0415, "step": 1604 }, { "epoch": 1.2687747035573123, "grad_norm": 2.1304986476898193, "learning_rate": 1.8875217801346039e-06, "loss": 4.0378, "step": 1605 }, { "epoch": 1.2695652173913043, "grad_norm": 2.139681100845337, "learning_rate": 1.8863099299665175e-06, "loss": 4.0138, "step": 1606 }, { "epoch": 1.2703557312252964, "grad_norm": 2.2658514976501465, "learning_rate": 1.885097809680633e-06, "loss": 3.8934, "step": 1607 }, { "epoch": 1.2711462450592885, "grad_norm": 2.1519763469696045, "learning_rate": 1.8838854201244951e-06, "loss": 4.0041, "step": 1608 }, { "epoch": 1.2719367588932806, "grad_norm": 2.1007611751556396, "learning_rate": 1.882672762145838e-06, "loss": 3.9581, "step": 1609 }, { "epoch": 1.2727272727272727, "grad_norm": 2.1095948219299316, "learning_rate": 1.8814598365925835e-06, "loss": 4.0659, "step": 1610 }, { "epoch": 1.2735177865612648, "grad_norm": 2.324256420135498, "learning_rate": 1.88024664431284e-06, "loss": 3.6693, "step": 1611 }, { "epoch": 1.2743083003952569, "grad_norm": 2.1863300800323486, "learning_rate": 1.8790331861549024e-06, "loss": 4.172, "step": 1612 }, { "epoch": 1.275098814229249, "grad_norm": 2.212843656539917, "learning_rate": 1.8778194629672516e-06, "loss": 3.9576, "step": 1613 }, { "epoch": 1.275889328063241, "grad_norm": 2.3521358966827393, "learning_rate": 1.8766054755985544e-06, "loss": 4.0682, "step": 1614 }, { "epoch": 1.2766798418972332, "grad_norm": 2.0837104320526123, "learning_rate": 1.8753912248976618e-06, "loss": 4.0699, "step": 1615 }, { "epoch": 1.2774703557312252, "grad_norm": 2.147761821746826, "learning_rate": 1.8741767117136088e-06, "loss": 4.1261, "step": 1616 }, { "epoch": 1.2782608695652173, "grad_norm": 2.0941550731658936, "learning_rate": 1.8729619368956146e-06, "loss": 4.1139, "step": 1617 }, { "epoch": 1.2790513833992094, "grad_norm": 2.0058693885803223, "learning_rate": 1.87174690129308e-06, "loss": 4.1595, "step": 1618 }, { "epoch": 1.2798418972332015, "grad_norm": 2.242588520050049, "learning_rate": 1.8705316057555895e-06, "loss": 3.7634, "step": 1619 }, { "epoch": 1.2806324110671936, "grad_norm": 2.1915576457977295, "learning_rate": 1.869316051132909e-06, "loss": 3.9897, "step": 1620 }, { "epoch": 1.2814229249011857, "grad_norm": 2.239666223526001, "learning_rate": 1.8681002382749856e-06, "loss": 3.7824, "step": 1621 }, { "epoch": 1.2822134387351778, "grad_norm": 2.2389426231384277, "learning_rate": 1.8668841680319457e-06, "loss": 3.9639, "step": 1622 }, { "epoch": 1.2830039525691699, "grad_norm": 2.0692460536956787, "learning_rate": 1.8656678412540986e-06, "loss": 4.1869, "step": 1623 }, { "epoch": 1.2837944664031622, "grad_norm": 2.227144241333008, "learning_rate": 1.8644512587919293e-06, "loss": 3.9063, "step": 1624 }, { "epoch": 1.2845849802371543, "grad_norm": 2.205573320388794, "learning_rate": 1.8632344214961047e-06, "loss": 3.9996, "step": 1625 }, { "epoch": 1.2853754940711464, "grad_norm": 2.1790401935577393, "learning_rate": 1.8620173302174676e-06, "loss": 3.9886, "step": 1626 }, { "epoch": 1.2861660079051385, "grad_norm": 2.2918338775634766, "learning_rate": 1.8607999858070404e-06, "loss": 3.3103, "step": 1627 }, { "epoch": 1.2869565217391306, "grad_norm": 3.049720048904419, "learning_rate": 1.859582389116021e-06, "loss": 3.5963, "step": 1628 }, { "epoch": 1.2877470355731226, "grad_norm": 2.329535484313965, "learning_rate": 1.858364540995784e-06, "loss": 3.6731, "step": 1629 }, { "epoch": 1.2885375494071147, "grad_norm": 4.034201622009277, "learning_rate": 1.8571464422978802e-06, "loss": 3.9205, "step": 1630 }, { "epoch": 1.2893280632411068, "grad_norm": 2.14461088180542, "learning_rate": 1.855928093874036e-06, "loss": 3.9689, "step": 1631 }, { "epoch": 1.290118577075099, "grad_norm": 1.9174836874008179, "learning_rate": 1.85470949657615e-06, "loss": 4.4037, "step": 1632 }, { "epoch": 1.290909090909091, "grad_norm": 2.195953369140625, "learning_rate": 1.8534906512562992e-06, "loss": 3.6697, "step": 1633 }, { "epoch": 1.291699604743083, "grad_norm": 2.5484347343444824, "learning_rate": 1.8522715587667298e-06, "loss": 3.9707, "step": 1634 }, { "epoch": 1.2924901185770752, "grad_norm": 2.307874917984009, "learning_rate": 1.8510522199598626e-06, "loss": 3.7029, "step": 1635 }, { "epoch": 1.2932806324110673, "grad_norm": 2.0932958126068115, "learning_rate": 1.8498326356882907e-06, "loss": 3.9903, "step": 1636 }, { "epoch": 1.2940711462450594, "grad_norm": 2.2938828468322754, "learning_rate": 1.8486128068047794e-06, "loss": 3.7758, "step": 1637 }, { "epoch": 1.2948616600790515, "grad_norm": 2.3308370113372803, "learning_rate": 1.847392734162263e-06, "loss": 4.0166, "step": 1638 }, { "epoch": 1.2956521739130435, "grad_norm": 2.0103251934051514, "learning_rate": 1.8461724186138485e-06, "loss": 4.2547, "step": 1639 }, { "epoch": 1.2964426877470356, "grad_norm": 2.475212574005127, "learning_rate": 1.8449518610128115e-06, "loss": 3.4078, "step": 1640 }, { "epoch": 1.2972332015810277, "grad_norm": 2.065863609313965, "learning_rate": 1.8437310622125973e-06, "loss": 3.9642, "step": 1641 }, { "epoch": 1.2980237154150198, "grad_norm": 2.260671854019165, "learning_rate": 1.8425100230668188e-06, "loss": 3.7393, "step": 1642 }, { "epoch": 1.298814229249012, "grad_norm": 2.217287302017212, "learning_rate": 1.8412887444292597e-06, "loss": 3.8215, "step": 1643 }, { "epoch": 1.299604743083004, "grad_norm": 2.137362003326416, "learning_rate": 1.8400672271538673e-06, "loss": 4.0816, "step": 1644 }, { "epoch": 1.300395256916996, "grad_norm": 2.2354140281677246, "learning_rate": 1.838845472094759e-06, "loss": 3.792, "step": 1645 }, { "epoch": 1.3011857707509882, "grad_norm": 2.263193130493164, "learning_rate": 1.837623480106217e-06, "loss": 4.2808, "step": 1646 }, { "epoch": 1.3019762845849803, "grad_norm": 2.512347459793091, "learning_rate": 1.8364012520426892e-06, "loss": 4.0179, "step": 1647 }, { "epoch": 1.3027667984189724, "grad_norm": 2.258831739425659, "learning_rate": 1.8351787887587884e-06, "loss": 3.955, "step": 1648 }, { "epoch": 1.3035573122529645, "grad_norm": 2.2279250621795654, "learning_rate": 1.833956091109293e-06, "loss": 3.9268, "step": 1649 }, { "epoch": 1.3043478260869565, "grad_norm": 2.191500186920166, "learning_rate": 1.8327331599491446e-06, "loss": 3.9186, "step": 1650 }, { "epoch": 1.3051383399209486, "grad_norm": 2.3113558292388916, "learning_rate": 1.831509996133447e-06, "loss": 3.7915, "step": 1651 }, { "epoch": 1.3059288537549407, "grad_norm": 2.221848964691162, "learning_rate": 1.8302866005174687e-06, "loss": 4.085, "step": 1652 }, { "epoch": 1.3067193675889328, "grad_norm": 2.0689566135406494, "learning_rate": 1.829062973956639e-06, "loss": 4.0241, "step": 1653 }, { "epoch": 1.307509881422925, "grad_norm": 2.3396670818328857, "learning_rate": 1.8278391173065484e-06, "loss": 4.104, "step": 1654 }, { "epoch": 1.308300395256917, "grad_norm": 3.157404661178589, "learning_rate": 1.8266150314229494e-06, "loss": 3.7691, "step": 1655 }, { "epoch": 1.309090909090909, "grad_norm": 2.0163607597351074, "learning_rate": 1.8253907171617537e-06, "loss": 4.2143, "step": 1656 }, { "epoch": 1.3098814229249012, "grad_norm": 2.3444080352783203, "learning_rate": 1.8241661753790337e-06, "loss": 3.5743, "step": 1657 }, { "epoch": 1.3106719367588933, "grad_norm": 2.9186949729919434, "learning_rate": 1.8229414069310205e-06, "loss": 3.8701, "step": 1658 }, { "epoch": 1.3114624505928854, "grad_norm": 2.132711887359619, "learning_rate": 1.8217164126741028e-06, "loss": 4.0706, "step": 1659 }, { "epoch": 1.3122529644268774, "grad_norm": 1.9922256469726562, "learning_rate": 1.8204911934648283e-06, "loss": 4.419, "step": 1660 }, { "epoch": 1.3130434782608695, "grad_norm": 2.3229289054870605, "learning_rate": 1.8192657501599018e-06, "loss": 3.7995, "step": 1661 }, { "epoch": 1.3138339920948616, "grad_norm": 2.202925443649292, "learning_rate": 1.8180400836161852e-06, "loss": 4.1215, "step": 1662 }, { "epoch": 1.3146245059288537, "grad_norm": 2.1832668781280518, "learning_rate": 1.8168141946906947e-06, "loss": 3.7908, "step": 1663 }, { "epoch": 1.3154150197628458, "grad_norm": 2.0845837593078613, "learning_rate": 1.8155880842406042e-06, "loss": 3.8424, "step": 1664 }, { "epoch": 1.316205533596838, "grad_norm": 2.44409441947937, "learning_rate": 1.814361753123241e-06, "loss": 3.7069, "step": 1665 }, { "epoch": 1.31699604743083, "grad_norm": 2.3064122200012207, "learning_rate": 1.8131352021960876e-06, "loss": 3.8767, "step": 1666 }, { "epoch": 1.317786561264822, "grad_norm": 2.412645101547241, "learning_rate": 1.8119084323167796e-06, "loss": 3.5874, "step": 1667 }, { "epoch": 1.3185770750988142, "grad_norm": 2.01531982421875, "learning_rate": 1.810681444343106e-06, "loss": 4.0725, "step": 1668 }, { "epoch": 1.3193675889328063, "grad_norm": 2.1090683937072754, "learning_rate": 1.8094542391330086e-06, "loss": 4.1988, "step": 1669 }, { "epoch": 1.3201581027667983, "grad_norm": 2.7669289112091064, "learning_rate": 1.80822681754458e-06, "loss": 4.0339, "step": 1670 }, { "epoch": 1.3209486166007904, "grad_norm": 2.1565651893615723, "learning_rate": 1.806999180436065e-06, "loss": 4.0641, "step": 1671 }, { "epoch": 1.3217391304347825, "grad_norm": 2.5975821018218994, "learning_rate": 1.8057713286658595e-06, "loss": 4.027, "step": 1672 }, { "epoch": 1.3225296442687746, "grad_norm": 2.170956611633301, "learning_rate": 1.8045432630925082e-06, "loss": 3.6456, "step": 1673 }, { "epoch": 1.3233201581027667, "grad_norm": 2.744940996170044, "learning_rate": 1.8033149845747062e-06, "loss": 3.5037, "step": 1674 }, { "epoch": 1.3241106719367588, "grad_norm": 2.304992198944092, "learning_rate": 1.8020864939712977e-06, "loss": 3.9591, "step": 1675 }, { "epoch": 1.3249011857707509, "grad_norm": 1.984457015991211, "learning_rate": 1.8008577921412747e-06, "loss": 4.3798, "step": 1676 }, { "epoch": 1.325691699604743, "grad_norm": 2.2826459407806396, "learning_rate": 1.7996288799437761e-06, "loss": 3.8664, "step": 1677 }, { "epoch": 1.326482213438735, "grad_norm": 2.531522035598755, "learning_rate": 1.79839975823809e-06, "loss": 3.8256, "step": 1678 }, { "epoch": 1.3272727272727272, "grad_norm": 2.2025654315948486, "learning_rate": 1.797170427883649e-06, "loss": 3.8636, "step": 1679 }, { "epoch": 1.3280632411067192, "grad_norm": 2.33943247795105, "learning_rate": 1.7959408897400326e-06, "loss": 3.9204, "step": 1680 }, { "epoch": 1.3288537549407113, "grad_norm": 2.1363189220428467, "learning_rate": 1.7947111446669655e-06, "loss": 4.2197, "step": 1681 }, { "epoch": 1.3296442687747034, "grad_norm": 2.539590835571289, "learning_rate": 1.7934811935243173e-06, "loss": 4.2475, "step": 1682 }, { "epoch": 1.3304347826086955, "grad_norm": 2.1570324897766113, "learning_rate": 1.7922510371721e-06, "loss": 4.0362, "step": 1683 }, { "epoch": 1.3312252964426876, "grad_norm": 2.3283498287200928, "learning_rate": 1.791020676470472e-06, "loss": 3.9551, "step": 1684 }, { "epoch": 1.33201581027668, "grad_norm": 2.0759437084198, "learning_rate": 1.7897901122797321e-06, "loss": 4.0939, "step": 1685 }, { "epoch": 1.332806324110672, "grad_norm": 1.9600073099136353, "learning_rate": 1.788559345460323e-06, "loss": 4.2804, "step": 1686 }, { "epoch": 1.333596837944664, "grad_norm": 2.214916229248047, "learning_rate": 1.7873283768728268e-06, "loss": 3.902, "step": 1687 }, { "epoch": 1.3343873517786562, "grad_norm": 2.0842297077178955, "learning_rate": 1.7860972073779705e-06, "loss": 4.2504, "step": 1688 }, { "epoch": 1.3351778656126483, "grad_norm": 2.0273215770721436, "learning_rate": 1.7848658378366177e-06, "loss": 4.2655, "step": 1689 }, { "epoch": 1.3359683794466404, "grad_norm": 2.209808826446533, "learning_rate": 1.7836342691097743e-06, "loss": 4.3029, "step": 1690 }, { "epoch": 1.3367588932806325, "grad_norm": 2.211700201034546, "learning_rate": 1.7824025020585847e-06, "loss": 3.7274, "step": 1691 }, { "epoch": 1.3375494071146246, "grad_norm": 2.038367748260498, "learning_rate": 1.7811705375443315e-06, "loss": 4.3051, "step": 1692 }, { "epoch": 1.3383399209486166, "grad_norm": 2.173611879348755, "learning_rate": 1.7799383764284356e-06, "loss": 4.054, "step": 1693 }, { "epoch": 1.3391304347826087, "grad_norm": 2.1155178546905518, "learning_rate": 1.7787060195724572e-06, "loss": 3.9209, "step": 1694 }, { "epoch": 1.3399209486166008, "grad_norm": 2.067927837371826, "learning_rate": 1.7774734678380899e-06, "loss": 3.8829, "step": 1695 }, { "epoch": 1.340711462450593, "grad_norm": 2.592780590057373, "learning_rate": 1.7762407220871662e-06, "loss": 3.5643, "step": 1696 }, { "epoch": 1.341501976284585, "grad_norm": 2.5587828159332275, "learning_rate": 1.775007783181654e-06, "loss": 3.7626, "step": 1697 }, { "epoch": 1.342292490118577, "grad_norm": 2.284315347671509, "learning_rate": 1.7737746519836552e-06, "loss": 4.0489, "step": 1698 }, { "epoch": 1.3430830039525692, "grad_norm": 2.059884786605835, "learning_rate": 1.7725413293554064e-06, "loss": 4.0098, "step": 1699 }, { "epoch": 1.3438735177865613, "grad_norm": 2.0869202613830566, "learning_rate": 1.7713078161592795e-06, "loss": 4.464, "step": 1700 }, { "epoch": 1.3446640316205534, "grad_norm": 2.4743828773498535, "learning_rate": 1.7700741132577775e-06, "loss": 3.9231, "step": 1701 }, { "epoch": 1.3454545454545455, "grad_norm": 2.2377071380615234, "learning_rate": 1.7688402215135375e-06, "loss": 3.7555, "step": 1702 }, { "epoch": 1.3462450592885375, "grad_norm": 2.1792526245117188, "learning_rate": 1.7676061417893277e-06, "loss": 4.0836, "step": 1703 }, { "epoch": 1.3470355731225296, "grad_norm": 2.207141160964966, "learning_rate": 1.7663718749480492e-06, "loss": 3.952, "step": 1704 }, { "epoch": 1.3478260869565217, "grad_norm": 2.0757462978363037, "learning_rate": 1.7651374218527324e-06, "loss": 4.1955, "step": 1705 }, { "epoch": 1.3486166007905138, "grad_norm": 2.1874349117279053, "learning_rate": 1.7639027833665383e-06, "loss": 3.9815, "step": 1706 }, { "epoch": 1.349407114624506, "grad_norm": 2.220123052597046, "learning_rate": 1.7626679603527582e-06, "loss": 4.1057, "step": 1707 }, { "epoch": 1.350197628458498, "grad_norm": 2.4246890544891357, "learning_rate": 1.761432953674812e-06, "loss": 3.9122, "step": 1708 }, { "epoch": 1.35098814229249, "grad_norm": 2.2340548038482666, "learning_rate": 1.7601977641962476e-06, "loss": 3.9887, "step": 1709 }, { "epoch": 1.3517786561264822, "grad_norm": 2.1134748458862305, "learning_rate": 1.7589623927807414e-06, "loss": 3.9001, "step": 1710 }, { "epoch": 1.3525691699604743, "grad_norm": 3.3577349185943604, "learning_rate": 1.7577268402920969e-06, "loss": 4.0402, "step": 1711 }, { "epoch": 1.3533596837944664, "grad_norm": 2.397693157196045, "learning_rate": 1.7564911075942434e-06, "loss": 3.7242, "step": 1712 }, { "epoch": 1.3541501976284585, "grad_norm": 2.66144061088562, "learning_rate": 1.7552551955512378e-06, "loss": 3.6164, "step": 1713 }, { "epoch": 1.3549407114624505, "grad_norm": 2.087829828262329, "learning_rate": 1.7540191050272605e-06, "loss": 4.0979, "step": 1714 }, { "epoch": 1.3557312252964426, "grad_norm": 2.114757776260376, "learning_rate": 1.7527828368866184e-06, "loss": 4.0378, "step": 1715 }, { "epoch": 1.3565217391304347, "grad_norm": 2.2007594108581543, "learning_rate": 1.7515463919937415e-06, "loss": 4.0625, "step": 1716 }, { "epoch": 1.3573122529644268, "grad_norm": 2.205871343612671, "learning_rate": 1.7503097712131838e-06, "loss": 3.9982, "step": 1717 }, { "epoch": 1.358102766798419, "grad_norm": 2.0667688846588135, "learning_rate": 1.7490729754096224e-06, "loss": 4.1751, "step": 1718 }, { "epoch": 1.358893280632411, "grad_norm": 2.1784465312957764, "learning_rate": 1.7478360054478563e-06, "loss": 4.2542, "step": 1719 }, { "epoch": 1.359683794466403, "grad_norm": 2.1186141967773438, "learning_rate": 1.7465988621928068e-06, "loss": 4.0655, "step": 1720 }, { "epoch": 1.3604743083003952, "grad_norm": 2.077233076095581, "learning_rate": 1.7453615465095158e-06, "loss": 3.8234, "step": 1721 }, { "epoch": 1.3612648221343875, "grad_norm": 2.3613736629486084, "learning_rate": 1.7441240592631465e-06, "loss": 4.1053, "step": 1722 }, { "epoch": 1.3620553359683796, "grad_norm": 2.1949410438537598, "learning_rate": 1.742886401318982e-06, "loss": 3.9566, "step": 1723 }, { "epoch": 1.3628458498023717, "grad_norm": 2.584557294845581, "learning_rate": 1.7416485735424237e-06, "loss": 3.6819, "step": 1724 }, { "epoch": 1.3636363636363638, "grad_norm": 2.175520420074463, "learning_rate": 1.7404105767989928e-06, "loss": 4.0621, "step": 1725 }, { "epoch": 1.3644268774703558, "grad_norm": 2.034012794494629, "learning_rate": 1.7391724119543285e-06, "loss": 4.2779, "step": 1726 }, { "epoch": 1.365217391304348, "grad_norm": 2.2276575565338135, "learning_rate": 1.737934079874187e-06, "loss": 4.0419, "step": 1727 }, { "epoch": 1.36600790513834, "grad_norm": 2.13376522064209, "learning_rate": 1.7366955814244418e-06, "loss": 4.0855, "step": 1728 }, { "epoch": 1.3667984189723321, "grad_norm": 2.157428503036499, "learning_rate": 1.7354569174710838e-06, "loss": 4.0749, "step": 1729 }, { "epoch": 1.3675889328063242, "grad_norm": 2.1959500312805176, "learning_rate": 1.7342180888802172e-06, "loss": 4.0472, "step": 1730 }, { "epoch": 1.3683794466403163, "grad_norm": 2.0387320518493652, "learning_rate": 1.7329790965180633e-06, "loss": 4.1157, "step": 1731 }, { "epoch": 1.3691699604743084, "grad_norm": 2.2217206954956055, "learning_rate": 1.7317399412509568e-06, "loss": 3.5756, "step": 1732 }, { "epoch": 1.3699604743083005, "grad_norm": 3.145433187484741, "learning_rate": 1.730500623945348e-06, "loss": 4.1139, "step": 1733 }, { "epoch": 1.3707509881422926, "grad_norm": 2.0755865573883057, "learning_rate": 1.7292611454677982e-06, "loss": 4.17, "step": 1734 }, { "epoch": 1.3715415019762847, "grad_norm": 2.260216236114502, "learning_rate": 1.7280215066849825e-06, "loss": 4.1588, "step": 1735 }, { "epoch": 1.3723320158102768, "grad_norm": 2.231184482574463, "learning_rate": 1.7267817084636888e-06, "loss": 3.8545, "step": 1736 }, { "epoch": 1.3731225296442688, "grad_norm": 2.3856887817382812, "learning_rate": 1.7255417516708156e-06, "loss": 3.7007, "step": 1737 }, { "epoch": 1.373913043478261, "grad_norm": 2.121995687484741, "learning_rate": 1.7243016371733716e-06, "loss": 4.1161, "step": 1738 }, { "epoch": 1.374703557312253, "grad_norm": 2.3457818031311035, "learning_rate": 1.7230613658384775e-06, "loss": 3.9045, "step": 1739 }, { "epoch": 1.3754940711462451, "grad_norm": 2.076838493347168, "learning_rate": 1.7218209385333628e-06, "loss": 3.8813, "step": 1740 }, { "epoch": 1.3762845849802372, "grad_norm": 2.2951931953430176, "learning_rate": 1.7205803561253655e-06, "loss": 3.9417, "step": 1741 }, { "epoch": 1.3770750988142293, "grad_norm": 2.172450065612793, "learning_rate": 1.719339619481933e-06, "loss": 4.1528, "step": 1742 }, { "epoch": 1.3778656126482214, "grad_norm": 2.0974485874176025, "learning_rate": 1.7180987294706202e-06, "loss": 4.1824, "step": 1743 }, { "epoch": 1.3786561264822135, "grad_norm": 2.3199868202209473, "learning_rate": 1.7168576869590887e-06, "loss": 4.0277, "step": 1744 }, { "epoch": 1.3794466403162056, "grad_norm": 2.122138738632202, "learning_rate": 1.7156164928151083e-06, "loss": 4.1204, "step": 1745 }, { "epoch": 1.3802371541501977, "grad_norm": 3.6913304328918457, "learning_rate": 1.7143751479065523e-06, "loss": 3.9596, "step": 1746 }, { "epoch": 1.3810276679841897, "grad_norm": 2.1176788806915283, "learning_rate": 1.713133653101402e-06, "loss": 4.1401, "step": 1747 }, { "epoch": 1.3818181818181818, "grad_norm": 3.1229753494262695, "learning_rate": 1.7118920092677412e-06, "loss": 3.9205, "step": 1748 }, { "epoch": 1.382608695652174, "grad_norm": 2.0879838466644287, "learning_rate": 1.7106502172737605e-06, "loss": 4.1505, "step": 1749 }, { "epoch": 1.383399209486166, "grad_norm": 2.356067657470703, "learning_rate": 1.7094082779877515e-06, "loss": 3.9285, "step": 1750 }, { "epoch": 1.384189723320158, "grad_norm": 2.014065742492676, "learning_rate": 1.7081661922781105e-06, "loss": 4.1837, "step": 1751 }, { "epoch": 1.3849802371541502, "grad_norm": 2.2307145595550537, "learning_rate": 1.7069239610133358e-06, "loss": 3.8254, "step": 1752 }, { "epoch": 1.3857707509881423, "grad_norm": 2.3053534030914307, "learning_rate": 1.7056815850620268e-06, "loss": 4.0406, "step": 1753 }, { "epoch": 1.3865612648221344, "grad_norm": 2.702393054962158, "learning_rate": 1.7044390652928848e-06, "loss": 3.6865, "step": 1754 }, { "epoch": 1.3873517786561265, "grad_norm": 2.415476083755493, "learning_rate": 1.703196402574712e-06, "loss": 3.7868, "step": 1755 }, { "epoch": 1.3881422924901186, "grad_norm": 2.209779739379883, "learning_rate": 1.7019535977764093e-06, "loss": 4.1678, "step": 1756 }, { "epoch": 1.3889328063241106, "grad_norm": 2.537691354751587, "learning_rate": 1.7007106517669776e-06, "loss": 3.5253, "step": 1757 }, { "epoch": 1.3897233201581027, "grad_norm": 2.2333931922912598, "learning_rate": 1.6994675654155173e-06, "loss": 3.8026, "step": 1758 }, { "epoch": 1.3905138339920948, "grad_norm": 2.0994174480438232, "learning_rate": 1.6982243395912254e-06, "loss": 4.1067, "step": 1759 }, { "epoch": 1.391304347826087, "grad_norm": 2.1843862533569336, "learning_rate": 1.6969809751633979e-06, "loss": 3.7048, "step": 1760 }, { "epoch": 1.392094861660079, "grad_norm": 2.1379568576812744, "learning_rate": 1.6957374730014266e-06, "loss": 4.1461, "step": 1761 }, { "epoch": 1.392885375494071, "grad_norm": 2.3214306831359863, "learning_rate": 1.6944938339748003e-06, "loss": 3.7565, "step": 1762 }, { "epoch": 1.3936758893280632, "grad_norm": 2.1827213764190674, "learning_rate": 1.6932500589531035e-06, "loss": 3.9885, "step": 1763 }, { "epoch": 1.3944664031620553, "grad_norm": 2.2300679683685303, "learning_rate": 1.6920061488060147e-06, "loss": 3.6691, "step": 1764 }, { "epoch": 1.3952569169960474, "grad_norm": 2.574984312057495, "learning_rate": 1.6907621044033087e-06, "loss": 3.666, "step": 1765 }, { "epoch": 1.3960474308300395, "grad_norm": 2.1484534740448, "learning_rate": 1.6895179266148526e-06, "loss": 3.9606, "step": 1766 }, { "epoch": 1.3968379446640315, "grad_norm": 3.8874404430389404, "learning_rate": 1.6882736163106076e-06, "loss": 3.6272, "step": 1767 }, { "epoch": 1.3976284584980236, "grad_norm": 2.2103943824768066, "learning_rate": 1.6870291743606275e-06, "loss": 3.8943, "step": 1768 }, { "epoch": 1.3984189723320157, "grad_norm": 2.154387950897217, "learning_rate": 1.685784601635058e-06, "loss": 4.0565, "step": 1769 }, { "epoch": 1.3992094861660078, "grad_norm": 2.1303975582122803, "learning_rate": 1.6845398990041356e-06, "loss": 4.0606, "step": 1770 }, { "epoch": 1.4, "grad_norm": 2.2948007583618164, "learning_rate": 1.6832950673381892e-06, "loss": 4.0299, "step": 1771 }, { "epoch": 1.400790513833992, "grad_norm": 2.10387921333313, "learning_rate": 1.682050107507636e-06, "loss": 4.1624, "step": 1772 }, { "epoch": 1.401581027667984, "grad_norm": 2.2026209831237793, "learning_rate": 1.6808050203829845e-06, "loss": 3.6009, "step": 1773 }, { "epoch": 1.4023715415019762, "grad_norm": 2.3264002799987793, "learning_rate": 1.6795598068348317e-06, "loss": 4.2402, "step": 1774 }, { "epoch": 1.4031620553359683, "grad_norm": 2.365058422088623, "learning_rate": 1.678314467733862e-06, "loss": 4.3391, "step": 1775 }, { "epoch": 1.4039525691699604, "grad_norm": 2.298062324523926, "learning_rate": 1.6770690039508488e-06, "loss": 3.9019, "step": 1776 }, { "epoch": 1.4047430830039525, "grad_norm": 1.9263503551483154, "learning_rate": 1.6758234163566527e-06, "loss": 4.2523, "step": 1777 }, { "epoch": 1.4055335968379445, "grad_norm": 3.2223970890045166, "learning_rate": 1.67457770582222e-06, "loss": 4.0039, "step": 1778 }, { "epoch": 1.4063241106719366, "grad_norm": 2.2051730155944824, "learning_rate": 1.6733318732185835e-06, "loss": 3.966, "step": 1779 }, { "epoch": 1.4071146245059287, "grad_norm": 2.081047534942627, "learning_rate": 1.6720859194168606e-06, "loss": 3.9433, "step": 1780 }, { "epoch": 1.4079051383399208, "grad_norm": 2.3379781246185303, "learning_rate": 1.6708398452882554e-06, "loss": 3.8332, "step": 1781 }, { "epoch": 1.4086956521739131, "grad_norm": 2.1522746086120605, "learning_rate": 1.669593651704054e-06, "loss": 3.9352, "step": 1782 }, { "epoch": 1.4094861660079052, "grad_norm": 1.9626504182815552, "learning_rate": 1.668347339535627e-06, "loss": 4.1852, "step": 1783 }, { "epoch": 1.4102766798418973, "grad_norm": 2.0499091148376465, "learning_rate": 1.6671009096544281e-06, "loss": 4.2018, "step": 1784 }, { "epoch": 1.4110671936758894, "grad_norm": 2.076146125793457, "learning_rate": 1.6658543629319927e-06, "loss": 4.273, "step": 1785 }, { "epoch": 1.4118577075098815, "grad_norm": 2.213433027267456, "learning_rate": 1.6646077002399383e-06, "loss": 4.205, "step": 1786 }, { "epoch": 1.4126482213438736, "grad_norm": 2.5018978118896484, "learning_rate": 1.6633609224499631e-06, "loss": 3.3476, "step": 1787 }, { "epoch": 1.4134387351778657, "grad_norm": 2.299819231033325, "learning_rate": 1.6621140304338469e-06, "loss": 3.9833, "step": 1788 }, { "epoch": 1.4142292490118578, "grad_norm": 2.070673942565918, "learning_rate": 1.6608670250634474e-06, "loss": 4.0465, "step": 1789 }, { "epoch": 1.4150197628458498, "grad_norm": 2.2830088138580322, "learning_rate": 1.6596199072107037e-06, "loss": 3.8682, "step": 1790 }, { "epoch": 1.415810276679842, "grad_norm": 2.1053898334503174, "learning_rate": 1.6583726777476321e-06, "loss": 3.9945, "step": 1791 }, { "epoch": 1.416600790513834, "grad_norm": 2.1966676712036133, "learning_rate": 1.6571253375463275e-06, "loss": 4.2226, "step": 1792 }, { "epoch": 1.4173913043478261, "grad_norm": 2.109790325164795, "learning_rate": 1.6558778874789614e-06, "loss": 4.0625, "step": 1793 }, { "epoch": 1.4181818181818182, "grad_norm": 2.2221992015838623, "learning_rate": 1.6546303284177839e-06, "loss": 3.7871, "step": 1794 }, { "epoch": 1.4189723320158103, "grad_norm": 2.579231023788452, "learning_rate": 1.6533826612351198e-06, "loss": 3.8191, "step": 1795 }, { "epoch": 1.4197628458498024, "grad_norm": 2.2435574531555176, "learning_rate": 1.6521348868033698e-06, "loss": 3.8664, "step": 1796 }, { "epoch": 1.4205533596837945, "grad_norm": 2.0391318798065186, "learning_rate": 1.6508870059950102e-06, "loss": 4.0478, "step": 1797 }, { "epoch": 1.4213438735177866, "grad_norm": 2.3039498329162598, "learning_rate": 1.6496390196825909e-06, "loss": 3.8409, "step": 1798 }, { "epoch": 1.4221343873517787, "grad_norm": 2.3221116065979004, "learning_rate": 1.6483909287387356e-06, "loss": 3.8623, "step": 1799 }, { "epoch": 1.4229249011857708, "grad_norm": 2.171067953109741, "learning_rate": 1.6471427340361421e-06, "loss": 4.2021, "step": 1800 }, { "epoch": 1.4237154150197628, "grad_norm": 2.090212821960449, "learning_rate": 1.64589443644758e-06, "loss": 4.1296, "step": 1801 }, { "epoch": 1.424505928853755, "grad_norm": 2.2718067169189453, "learning_rate": 1.6446460368458905e-06, "loss": 3.8109, "step": 1802 }, { "epoch": 1.425296442687747, "grad_norm": 2.4200921058654785, "learning_rate": 1.643397536103987e-06, "loss": 3.7603, "step": 1803 }, { "epoch": 1.4260869565217391, "grad_norm": 2.208658218383789, "learning_rate": 1.642148935094853e-06, "loss": 3.9873, "step": 1804 }, { "epoch": 1.4268774703557312, "grad_norm": 2.0292537212371826, "learning_rate": 1.6409002346915423e-06, "loss": 3.7191, "step": 1805 }, { "epoch": 1.4276679841897233, "grad_norm": 2.435991048812866, "learning_rate": 1.6396514357671781e-06, "loss": 3.8735, "step": 1806 }, { "epoch": 1.4284584980237154, "grad_norm": 2.1654133796691895, "learning_rate": 1.6384025391949532e-06, "loss": 4.3186, "step": 1807 }, { "epoch": 1.4292490118577075, "grad_norm": 2.27652907371521, "learning_rate": 1.6371535458481274e-06, "loss": 4.1398, "step": 1808 }, { "epoch": 1.4300395256916996, "grad_norm": 2.1965792179107666, "learning_rate": 1.6359044566000288e-06, "loss": 3.7692, "step": 1809 }, { "epoch": 1.4308300395256917, "grad_norm": 2.079087734222412, "learning_rate": 1.6346552723240532e-06, "loss": 4.1173, "step": 1810 }, { "epoch": 1.4316205533596837, "grad_norm": 2.4007673263549805, "learning_rate": 1.633405993893662e-06, "loss": 4.1585, "step": 1811 }, { "epoch": 1.4324110671936758, "grad_norm": 2.180464506149292, "learning_rate": 1.6321566221823823e-06, "loss": 4.0973, "step": 1812 }, { "epoch": 1.433201581027668, "grad_norm": 2.82206654548645, "learning_rate": 1.6309071580638073e-06, "loss": 3.5954, "step": 1813 }, { "epoch": 1.43399209486166, "grad_norm": 1.944183349609375, "learning_rate": 1.6296576024115943e-06, "loss": 4.2177, "step": 1814 }, { "epoch": 1.434782608695652, "grad_norm": 2.178352117538452, "learning_rate": 1.6284079560994637e-06, "loss": 4.0823, "step": 1815 }, { "epoch": 1.4355731225296442, "grad_norm": 2.1748101711273193, "learning_rate": 1.6271582200012016e-06, "loss": 3.8262, "step": 1816 }, { "epoch": 1.4363636363636363, "grad_norm": 2.7297515869140625, "learning_rate": 1.6259083949906547e-06, "loss": 4.0711, "step": 1817 }, { "epoch": 1.4371541501976284, "grad_norm": 2.0792958736419678, "learning_rate": 1.6246584819417326e-06, "loss": 4.2879, "step": 1818 }, { "epoch": 1.4379446640316207, "grad_norm": 2.2757489681243896, "learning_rate": 1.623408481728407e-06, "loss": 3.6973, "step": 1819 }, { "epoch": 1.4387351778656128, "grad_norm": 2.138692617416382, "learning_rate": 1.62215839522471e-06, "loss": 4.2678, "step": 1820 }, { "epoch": 1.4395256916996049, "grad_norm": 2.15533185005188, "learning_rate": 1.6209082233047335e-06, "loss": 4.2872, "step": 1821 }, { "epoch": 1.440316205533597, "grad_norm": 1.99635910987854, "learning_rate": 1.6196579668426302e-06, "loss": 4.2851, "step": 1822 }, { "epoch": 1.441106719367589, "grad_norm": 3.0738561153411865, "learning_rate": 1.6184076267126115e-06, "loss": 3.9814, "step": 1823 }, { "epoch": 1.4418972332015811, "grad_norm": 2.5026254653930664, "learning_rate": 1.617157203788947e-06, "loss": 3.9465, "step": 1824 }, { "epoch": 1.4426877470355732, "grad_norm": 2.0331084728240967, "learning_rate": 1.6159066989459645e-06, "loss": 4.1008, "step": 1825 }, { "epoch": 1.4434782608695653, "grad_norm": 2.1927943229675293, "learning_rate": 1.6146561130580494e-06, "loss": 3.9478, "step": 1826 }, { "epoch": 1.4442687747035574, "grad_norm": 2.19948148727417, "learning_rate": 1.6134054469996428e-06, "loss": 3.9824, "step": 1827 }, { "epoch": 1.4450592885375495, "grad_norm": 1.9567893743515015, "learning_rate": 1.6121547016452425e-06, "loss": 4.3894, "step": 1828 }, { "epoch": 1.4458498023715416, "grad_norm": 2.02779221534729, "learning_rate": 1.6109038778694021e-06, "loss": 4.2801, "step": 1829 }, { "epoch": 1.4466403162055337, "grad_norm": 2.4502735137939453, "learning_rate": 1.6096529765467298e-06, "loss": 4.0145, "step": 1830 }, { "epoch": 1.4474308300395258, "grad_norm": 2.3274290561676025, "learning_rate": 1.608401998551887e-06, "loss": 3.9288, "step": 1831 }, { "epoch": 1.4482213438735179, "grad_norm": 2.000995635986328, "learning_rate": 1.6071509447595905e-06, "loss": 4.1252, "step": 1832 }, { "epoch": 1.44901185770751, "grad_norm": 2.769634962081909, "learning_rate": 1.605899816044608e-06, "loss": 4.2747, "step": 1833 }, { "epoch": 1.449802371541502, "grad_norm": 2.1255006790161133, "learning_rate": 1.604648613281762e-06, "loss": 4.0224, "step": 1834 }, { "epoch": 1.4505928853754941, "grad_norm": 2.5180177688598633, "learning_rate": 1.6033973373459248e-06, "loss": 3.6482, "step": 1835 }, { "epoch": 1.4513833992094862, "grad_norm": 2.1417758464813232, "learning_rate": 1.6021459891120207e-06, "loss": 4.0867, "step": 1836 }, { "epoch": 1.4521739130434783, "grad_norm": 2.3225550651550293, "learning_rate": 1.6008945694550245e-06, "loss": 3.9734, "step": 1837 }, { "epoch": 1.4529644268774704, "grad_norm": 2.191451072692871, "learning_rate": 1.599643079249961e-06, "loss": 3.9546, "step": 1838 }, { "epoch": 1.4537549407114625, "grad_norm": 2.1131865978240967, "learning_rate": 1.598391519371904e-06, "loss": 4.1122, "step": 1839 }, { "epoch": 1.4545454545454546, "grad_norm": 2.209261417388916, "learning_rate": 1.597139890695976e-06, "loss": 4.2177, "step": 1840 }, { "epoch": 1.4553359683794467, "grad_norm": 2.0593793392181396, "learning_rate": 1.5958881940973486e-06, "loss": 4.3723, "step": 1841 }, { "epoch": 1.4561264822134388, "grad_norm": 2.297184944152832, "learning_rate": 1.5946364304512394e-06, "loss": 3.9394, "step": 1842 }, { "epoch": 1.4569169960474309, "grad_norm": 2.426596164703369, "learning_rate": 1.593384600632914e-06, "loss": 4.1704, "step": 1843 }, { "epoch": 1.457707509881423, "grad_norm": 2.0436413288116455, "learning_rate": 1.5921327055176834e-06, "loss": 4.1948, "step": 1844 }, { "epoch": 1.458498023715415, "grad_norm": 2.4027962684631348, "learning_rate": 1.5908807459809057e-06, "loss": 3.4131, "step": 1845 }, { "epoch": 1.4592885375494071, "grad_norm": 2.2812561988830566, "learning_rate": 1.589628722897982e-06, "loss": 4.0833, "step": 1846 }, { "epoch": 1.4600790513833992, "grad_norm": 2.3450400829315186, "learning_rate": 1.5883766371443592e-06, "loss": 4.2693, "step": 1847 }, { "epoch": 1.4608695652173913, "grad_norm": 2.190075635910034, "learning_rate": 1.587124489595528e-06, "loss": 3.9586, "step": 1848 }, { "epoch": 1.4616600790513834, "grad_norm": 2.16695237159729, "learning_rate": 1.5858722811270224e-06, "loss": 3.8756, "step": 1849 }, { "epoch": 1.4624505928853755, "grad_norm": 2.295473337173462, "learning_rate": 1.5846200126144176e-06, "loss": 4.0226, "step": 1850 }, { "epoch": 1.4632411067193676, "grad_norm": 2.3922343254089355, "learning_rate": 1.5833676849333323e-06, "loss": 3.9753, "step": 1851 }, { "epoch": 1.4640316205533597, "grad_norm": 2.197141170501709, "learning_rate": 1.5821152989594267e-06, "loss": 4.1093, "step": 1852 }, { "epoch": 1.4648221343873518, "grad_norm": 2.1928811073303223, "learning_rate": 1.5808628555684006e-06, "loss": 3.9605, "step": 1853 }, { "epoch": 1.4656126482213438, "grad_norm": 2.103189706802368, "learning_rate": 1.579610355635994e-06, "loss": 4.1911, "step": 1854 }, { "epoch": 1.466403162055336, "grad_norm": 2.191096067428589, "learning_rate": 1.578357800037988e-06, "loss": 4.0326, "step": 1855 }, { "epoch": 1.467193675889328, "grad_norm": 2.0981431007385254, "learning_rate": 1.5771051896502007e-06, "loss": 3.7442, "step": 1856 }, { "epoch": 1.4679841897233201, "grad_norm": 2.206944227218628, "learning_rate": 1.5758525253484898e-06, "loss": 4.0887, "step": 1857 }, { "epoch": 1.4687747035573122, "grad_norm": 1.9874117374420166, "learning_rate": 1.5745998080087503e-06, "loss": 4.0796, "step": 1858 }, { "epoch": 1.4695652173913043, "grad_norm": 2.0597856044769287, "learning_rate": 1.5733470385069143e-06, "loss": 4.1989, "step": 1859 }, { "epoch": 1.4703557312252964, "grad_norm": 2.2087836265563965, "learning_rate": 1.5720942177189495e-06, "loss": 4.0146, "step": 1860 }, { "epoch": 1.4711462450592885, "grad_norm": 2.1900875568389893, "learning_rate": 1.5708413465208612e-06, "loss": 4.0232, "step": 1861 }, { "epoch": 1.4719367588932806, "grad_norm": 2.2797484397888184, "learning_rate": 1.569588425788689e-06, "loss": 3.8964, "step": 1862 }, { "epoch": 1.4727272727272727, "grad_norm": 2.0743815898895264, "learning_rate": 1.5683354563985067e-06, "loss": 4.2711, "step": 1863 }, { "epoch": 1.4735177865612648, "grad_norm": 2.0578970909118652, "learning_rate": 1.567082439226423e-06, "loss": 4.2275, "step": 1864 }, { "epoch": 1.4743083003952568, "grad_norm": 4.247472763061523, "learning_rate": 1.5658293751485798e-06, "loss": 4.1778, "step": 1865 }, { "epoch": 1.475098814229249, "grad_norm": 2.079162120819092, "learning_rate": 1.5645762650411506e-06, "loss": 4.1247, "step": 1866 }, { "epoch": 1.475889328063241, "grad_norm": 2.0468921661376953, "learning_rate": 1.563323109780343e-06, "loss": 4.3934, "step": 1867 }, { "epoch": 1.4766798418972331, "grad_norm": 2.1091806888580322, "learning_rate": 1.562069910242395e-06, "loss": 3.9909, "step": 1868 }, { "epoch": 1.4774703557312252, "grad_norm": 2.181609630584717, "learning_rate": 1.560816667303576e-06, "loss": 3.4993, "step": 1869 }, { "epoch": 1.4782608695652173, "grad_norm": 2.2610437870025635, "learning_rate": 1.5595633818401844e-06, "loss": 4.0378, "step": 1870 }, { "epoch": 1.4790513833992094, "grad_norm": 2.1869606971740723, "learning_rate": 1.5583100547285517e-06, "loss": 4.1144, "step": 1871 }, { "epoch": 1.4798418972332015, "grad_norm": 2.2058658599853516, "learning_rate": 1.5570566868450345e-06, "loss": 3.8504, "step": 1872 }, { "epoch": 1.4806324110671936, "grad_norm": 2.19874906539917, "learning_rate": 1.5558032790660202e-06, "loss": 4.0596, "step": 1873 }, { "epoch": 1.4814229249011857, "grad_norm": 2.2898952960968018, "learning_rate": 1.5545498322679238e-06, "loss": 3.8403, "step": 1874 }, { "epoch": 1.4822134387351777, "grad_norm": 2.1865227222442627, "learning_rate": 1.5532963473271879e-06, "loss": 3.919, "step": 1875 }, { "epoch": 1.4830039525691698, "grad_norm": 2.038257360458374, "learning_rate": 1.5520428251202799e-06, "loss": 4.15, "step": 1876 }, { "epoch": 1.483794466403162, "grad_norm": 2.729180335998535, "learning_rate": 1.550789266523696e-06, "loss": 3.6827, "step": 1877 }, { "epoch": 1.484584980237154, "grad_norm": 2.1017556190490723, "learning_rate": 1.549535672413956e-06, "loss": 4.2081, "step": 1878 }, { "epoch": 1.485375494071146, "grad_norm": 2.148892641067505, "learning_rate": 1.5482820436676047e-06, "loss": 3.9997, "step": 1879 }, { "epoch": 1.4861660079051384, "grad_norm": 2.240356922149658, "learning_rate": 1.5470283811612117e-06, "loss": 3.8141, "step": 1880 }, { "epoch": 1.4869565217391305, "grad_norm": 1.9139666557312012, "learning_rate": 1.5457746857713703e-06, "loss": 4.4561, "step": 1881 }, { "epoch": 1.4877470355731226, "grad_norm": 2.2500898838043213, "learning_rate": 1.5445209583746956e-06, "loss": 4.1148, "step": 1882 }, { "epoch": 1.4885375494071147, "grad_norm": 2.058666706085205, "learning_rate": 1.5432671998478266e-06, "loss": 4.0431, "step": 1883 }, { "epoch": 1.4893280632411068, "grad_norm": 2.1115708351135254, "learning_rate": 1.542013411067423e-06, "loss": 4.0569, "step": 1884 }, { "epoch": 1.4901185770750989, "grad_norm": 2.2057676315307617, "learning_rate": 1.5407595929101669e-06, "loss": 3.9994, "step": 1885 }, { "epoch": 1.490909090909091, "grad_norm": 2.2587952613830566, "learning_rate": 1.5395057462527584e-06, "loss": 3.9279, "step": 1886 }, { "epoch": 1.491699604743083, "grad_norm": 2.210808515548706, "learning_rate": 1.5382518719719203e-06, "loss": 4.2165, "step": 1887 }, { "epoch": 1.4924901185770751, "grad_norm": 2.397019624710083, "learning_rate": 1.5369979709443938e-06, "loss": 4.3268, "step": 1888 }, { "epoch": 1.4932806324110672, "grad_norm": 2.10331392288208, "learning_rate": 1.535744044046938e-06, "loss": 4.1574, "step": 1889 }, { "epoch": 1.4940711462450593, "grad_norm": 2.274280071258545, "learning_rate": 1.5344900921563313e-06, "loss": 4.1136, "step": 1890 }, { "epoch": 1.4948616600790514, "grad_norm": 2.095618486404419, "learning_rate": 1.5332361161493689e-06, "loss": 3.8159, "step": 1891 }, { "epoch": 1.4956521739130435, "grad_norm": 2.204226016998291, "learning_rate": 1.531982116902862e-06, "loss": 3.8329, "step": 1892 }, { "epoch": 1.4964426877470356, "grad_norm": 2.0086472034454346, "learning_rate": 1.5307280952936402e-06, "loss": 4.4671, "step": 1893 }, { "epoch": 1.4972332015810277, "grad_norm": 1.9549239873886108, "learning_rate": 1.5294740521985473e-06, "loss": 4.3904, "step": 1894 }, { "epoch": 1.4980237154150198, "grad_norm": 2.127422571182251, "learning_rate": 1.5282199884944416e-06, "loss": 4.3053, "step": 1895 }, { "epoch": 1.4988142292490119, "grad_norm": 2.288257360458374, "learning_rate": 1.5269659050581971e-06, "loss": 3.8804, "step": 1896 }, { "epoch": 1.499604743083004, "grad_norm": 2.1310060024261475, "learning_rate": 1.5257118027667013e-06, "loss": 4.061, "step": 1897 }, { "epoch": 1.500395256916996, "grad_norm": 2.372548818588257, "learning_rate": 1.524457682496854e-06, "loss": 3.8306, "step": 1898 }, { "epoch": 1.5011857707509881, "grad_norm": 2.379394054412842, "learning_rate": 1.523203545125569e-06, "loss": 3.5727, "step": 1899 }, { "epoch": 1.5019762845849802, "grad_norm": 2.5512900352478027, "learning_rate": 1.5219493915297707e-06, "loss": 4.0802, "step": 1900 }, { "epoch": 1.5027667984189723, "grad_norm": 2.0677781105041504, "learning_rate": 1.5206952225863958e-06, "loss": 4.3595, "step": 1901 }, { "epoch": 1.5035573122529644, "grad_norm": 2.0619170665740967, "learning_rate": 1.519441039172391e-06, "loss": 4.1814, "step": 1902 }, { "epoch": 1.5043478260869565, "grad_norm": 2.098423480987549, "learning_rate": 1.5181868421647134e-06, "loss": 4.0834, "step": 1903 }, { "epoch": 1.5051383399209486, "grad_norm": 2.0727035999298096, "learning_rate": 1.5169326324403303e-06, "loss": 4.2544, "step": 1904 }, { "epoch": 1.5059288537549407, "grad_norm": 2.066469192504883, "learning_rate": 1.5156784108762162e-06, "loss": 4.2529, "step": 1905 }, { "epoch": 1.5067193675889328, "grad_norm": 1.9312422275543213, "learning_rate": 1.5144241783493565e-06, "loss": 4.3842, "step": 1906 }, { "epoch": 1.5075098814229249, "grad_norm": 2.5267398357391357, "learning_rate": 1.5131699357367413e-06, "loss": 3.937, "step": 1907 }, { "epoch": 1.508300395256917, "grad_norm": 2.1301748752593994, "learning_rate": 1.5119156839153696e-06, "loss": 4.2629, "step": 1908 }, { "epoch": 1.509090909090909, "grad_norm": 2.421973943710327, "learning_rate": 1.5106614237622464e-06, "loss": 4.3756, "step": 1909 }, { "epoch": 1.5098814229249014, "grad_norm": 2.2632274627685547, "learning_rate": 1.509407156154383e-06, "loss": 3.8088, "step": 1910 }, { "epoch": 1.5106719367588934, "grad_norm": 2.52473783493042, "learning_rate": 1.508152881968795e-06, "loss": 4.0098, "step": 1911 }, { "epoch": 1.5114624505928855, "grad_norm": 2.0828399658203125, "learning_rate": 1.5068986020825028e-06, "loss": 4.1674, "step": 1912 }, { "epoch": 1.5122529644268776, "grad_norm": 2.1370866298675537, "learning_rate": 1.5056443173725314e-06, "loss": 4.0257, "step": 1913 }, { "epoch": 1.5130434782608697, "grad_norm": 2.144176483154297, "learning_rate": 1.504390028715909e-06, "loss": 3.8711, "step": 1914 }, { "epoch": 1.5138339920948618, "grad_norm": 2.75618314743042, "learning_rate": 1.5031357369896656e-06, "loss": 4.0716, "step": 1915 }, { "epoch": 1.514624505928854, "grad_norm": 2.2360191345214844, "learning_rate": 1.5018814430708351e-06, "loss": 3.991, "step": 1916 }, { "epoch": 1.515415019762846, "grad_norm": 2.106735944747925, "learning_rate": 1.5006271478364512e-06, "loss": 4.2269, "step": 1917 }, { "epoch": 1.516205533596838, "grad_norm": 2.4452455043792725, "learning_rate": 1.4993728521635493e-06, "loss": 3.3737, "step": 1918 }, { "epoch": 1.5169960474308302, "grad_norm": 2.2610456943511963, "learning_rate": 1.4981185569291656e-06, "loss": 3.9919, "step": 1919 }, { "epoch": 1.5177865612648223, "grad_norm": 2.229466676712036, "learning_rate": 1.4968642630103347e-06, "loss": 3.655, "step": 1920 }, { "epoch": 1.5185770750988143, "grad_norm": 1.9514448642730713, "learning_rate": 1.495609971284091e-06, "loss": 4.2416, "step": 1921 }, { "epoch": 1.5193675889328064, "grad_norm": 2.3061444759368896, "learning_rate": 1.494355682627469e-06, "loss": 3.9485, "step": 1922 }, { "epoch": 1.5201581027667985, "grad_norm": 2.1779561042785645, "learning_rate": 1.4931013979174974e-06, "loss": 3.8531, "step": 1923 }, { "epoch": 1.5209486166007906, "grad_norm": 2.136265754699707, "learning_rate": 1.4918471180312054e-06, "loss": 3.9543, "step": 1924 }, { "epoch": 1.5217391304347827, "grad_norm": 2.096207857131958, "learning_rate": 1.4905928438456173e-06, "loss": 4.0624, "step": 1925 }, { "epoch": 1.5225296442687748, "grad_norm": 2.195760726928711, "learning_rate": 1.4893385762377537e-06, "loss": 4.1318, "step": 1926 }, { "epoch": 1.5233201581027669, "grad_norm": 2.3816516399383545, "learning_rate": 1.4880843160846305e-06, "loss": 3.987, "step": 1927 }, { "epoch": 1.524110671936759, "grad_norm": 4.9112868309021, "learning_rate": 1.4868300642632594e-06, "loss": 3.6826, "step": 1928 }, { "epoch": 1.524901185770751, "grad_norm": 2.5982675552368164, "learning_rate": 1.4855758216506438e-06, "loss": 4.0903, "step": 1929 }, { "epoch": 1.5256916996047432, "grad_norm": 2.186978340148926, "learning_rate": 1.4843215891237837e-06, "loss": 4.2916, "step": 1930 }, { "epoch": 1.5264822134387352, "grad_norm": 2.6186468601226807, "learning_rate": 1.4830673675596704e-06, "loss": 3.6904, "step": 1931 }, { "epoch": 1.5272727272727273, "grad_norm": 4.451548099517822, "learning_rate": 1.4818131578352868e-06, "loss": 4.1079, "step": 1932 }, { "epoch": 1.5280632411067194, "grad_norm": 2.3357746601104736, "learning_rate": 1.4805589608276095e-06, "loss": 3.7424, "step": 1933 }, { "epoch": 1.5288537549407115, "grad_norm": 2.3098578453063965, "learning_rate": 1.479304777413605e-06, "loss": 4.0385, "step": 1934 }, { "epoch": 1.5296442687747036, "grad_norm": 2.019561290740967, "learning_rate": 1.4780506084702293e-06, "loss": 4.2245, "step": 1935 }, { "epoch": 1.5304347826086957, "grad_norm": 2.4027352333068848, "learning_rate": 1.476796454874431e-06, "loss": 4.1533, "step": 1936 }, { "epoch": 1.5312252964426878, "grad_norm": 2.240713596343994, "learning_rate": 1.4755423175031462e-06, "loss": 4.0489, "step": 1937 }, { "epoch": 1.5320158102766799, "grad_norm": 2.2390828132629395, "learning_rate": 1.474288197233299e-06, "loss": 4.0977, "step": 1938 }, { "epoch": 1.532806324110672, "grad_norm": 2.1001317501068115, "learning_rate": 1.473034094941803e-06, "loss": 3.989, "step": 1939 }, { "epoch": 1.533596837944664, "grad_norm": 2.0650293827056885, "learning_rate": 1.471780011505559e-06, "loss": 4.5019, "step": 1940 }, { "epoch": 1.5343873517786562, "grad_norm": 2.192596673965454, "learning_rate": 1.4705259478014532e-06, "loss": 3.945, "step": 1941 }, { "epoch": 1.5351778656126482, "grad_norm": 2.3504724502563477, "learning_rate": 1.4692719047063596e-06, "loss": 3.9248, "step": 1942 }, { "epoch": 1.5359683794466403, "grad_norm": 2.450995445251465, "learning_rate": 1.468017883097138e-06, "loss": 3.8408, "step": 1943 }, { "epoch": 1.5367588932806324, "grad_norm": 2.5090839862823486, "learning_rate": 1.4667638838506316e-06, "loss": 3.8348, "step": 1944 }, { "epoch": 1.5375494071146245, "grad_norm": 2.1799843311309814, "learning_rate": 1.4655099078436688e-06, "loss": 4.2279, "step": 1945 }, { "epoch": 1.5383399209486166, "grad_norm": 2.121929407119751, "learning_rate": 1.4642559559530623e-06, "loss": 4.1858, "step": 1946 }, { "epoch": 1.5391304347826087, "grad_norm": 2.5614986419677734, "learning_rate": 1.4630020290556065e-06, "loss": 4.2079, "step": 1947 }, { "epoch": 1.5399209486166008, "grad_norm": 2.245854616165161, "learning_rate": 1.4617481280280802e-06, "loss": 3.8806, "step": 1948 }, { "epoch": 1.5407114624505929, "grad_norm": 2.129379987716675, "learning_rate": 1.4604942537472417e-06, "loss": 4.0841, "step": 1949 }, { "epoch": 1.541501976284585, "grad_norm": 1.9740551710128784, "learning_rate": 1.4592404070898336e-06, "loss": 4.2725, "step": 1950 }, { "epoch": 1.542292490118577, "grad_norm": 2.2241764068603516, "learning_rate": 1.4579865889325772e-06, "loss": 3.9264, "step": 1951 }, { "epoch": 1.5430830039525691, "grad_norm": 2.152959108352661, "learning_rate": 1.4567328001521736e-06, "loss": 4.1613, "step": 1952 }, { "epoch": 1.5438735177865612, "grad_norm": 2.373901128768921, "learning_rate": 1.4554790416253045e-06, "loss": 4.0078, "step": 1953 }, { "epoch": 1.5446640316205533, "grad_norm": 2.203407049179077, "learning_rate": 1.4542253142286304e-06, "loss": 3.9894, "step": 1954 }, { "epoch": 1.5454545454545454, "grad_norm": 2.4243783950805664, "learning_rate": 1.4529716188387886e-06, "loss": 3.7838, "step": 1955 }, { "epoch": 1.5462450592885375, "grad_norm": 2.2188034057617188, "learning_rate": 1.4517179563323953e-06, "loss": 4.2645, "step": 1956 }, { "epoch": 1.5470355731225296, "grad_norm": 2.7041921615600586, "learning_rate": 1.4504643275860443e-06, "loss": 3.9219, "step": 1957 }, { "epoch": 1.5478260869565217, "grad_norm": 2.339592218399048, "learning_rate": 1.449210733476304e-06, "loss": 3.9332, "step": 1958 }, { "epoch": 1.5486166007905138, "grad_norm": 2.509976863861084, "learning_rate": 1.4479571748797202e-06, "loss": 4.1587, "step": 1959 }, { "epoch": 1.5494071146245059, "grad_norm": 2.5146331787109375, "learning_rate": 1.4467036526728128e-06, "loss": 3.7534, "step": 1960 }, { "epoch": 1.550197628458498, "grad_norm": 2.518272876739502, "learning_rate": 1.4454501677320763e-06, "loss": 4.0608, "step": 1961 }, { "epoch": 1.55098814229249, "grad_norm": 2.18310546875, "learning_rate": 1.4441967209339796e-06, "loss": 4.1589, "step": 1962 }, { "epoch": 1.5517786561264821, "grad_norm": 2.1625308990478516, "learning_rate": 1.4429433131549662e-06, "loss": 3.8384, "step": 1963 }, { "epoch": 1.5525691699604742, "grad_norm": 2.284661293029785, "learning_rate": 1.4416899452714488e-06, "loss": 3.6869, "step": 1964 }, { "epoch": 1.5533596837944663, "grad_norm": 1.9884685277938843, "learning_rate": 1.4404366181598153e-06, "loss": 4.1729, "step": 1965 }, { "epoch": 1.5541501976284584, "grad_norm": 2.1866540908813477, "learning_rate": 1.4391833326964248e-06, "loss": 4.2632, "step": 1966 }, { "epoch": 1.5549407114624505, "grad_norm": 2.1878116130828857, "learning_rate": 1.4379300897576053e-06, "loss": 4.1008, "step": 1967 }, { "epoch": 1.5557312252964426, "grad_norm": 2.1707699298858643, "learning_rate": 1.436676890219657e-06, "loss": 3.9708, "step": 1968 }, { "epoch": 1.5565217391304347, "grad_norm": 2.2397191524505615, "learning_rate": 1.43542373495885e-06, "loss": 3.9229, "step": 1969 }, { "epoch": 1.5573122529644268, "grad_norm": 2.193763017654419, "learning_rate": 1.4341706248514205e-06, "loss": 3.8068, "step": 1970 }, { "epoch": 1.5581027667984189, "grad_norm": 2.4402265548706055, "learning_rate": 1.4329175607735768e-06, "loss": 3.8942, "step": 1971 }, { "epoch": 1.558893280632411, "grad_norm": 2.4148077964782715, "learning_rate": 1.4316645436014936e-06, "loss": 4.0595, "step": 1972 }, { "epoch": 1.559683794466403, "grad_norm": 2.4417176246643066, "learning_rate": 1.4304115742113112e-06, "loss": 4.1149, "step": 1973 }, { "epoch": 1.5604743083003951, "grad_norm": 2.148787498474121, "learning_rate": 1.4291586534791386e-06, "loss": 4.0254, "step": 1974 }, { "epoch": 1.5612648221343872, "grad_norm": 2.9160385131835938, "learning_rate": 1.427905782281051e-06, "loss": 3.7859, "step": 1975 }, { "epoch": 1.5620553359683793, "grad_norm": 2.074892520904541, "learning_rate": 1.4266529614930862e-06, "loss": 4.1842, "step": 1976 }, { "epoch": 1.5628458498023714, "grad_norm": 2.53354549407959, "learning_rate": 1.4254001919912497e-06, "loss": 3.8358, "step": 1977 }, { "epoch": 1.5636363636363635, "grad_norm": 2.647312879562378, "learning_rate": 1.4241474746515103e-06, "loss": 3.786, "step": 1978 }, { "epoch": 1.5644268774703556, "grad_norm": 2.286090612411499, "learning_rate": 1.4228948103497996e-06, "loss": 3.9896, "step": 1979 }, { "epoch": 1.5652173913043477, "grad_norm": 2.1607344150543213, "learning_rate": 1.4216421999620127e-06, "loss": 4.079, "step": 1980 }, { "epoch": 1.5660079051383398, "grad_norm": 2.422759532928467, "learning_rate": 1.4203896443640064e-06, "loss": 3.8003, "step": 1981 }, { "epoch": 1.5667984189723319, "grad_norm": 2.1557857990264893, "learning_rate": 1.4191371444315999e-06, "loss": 4.0349, "step": 1982 }, { "epoch": 1.5675889328063242, "grad_norm": 2.127131223678589, "learning_rate": 1.417884701040574e-06, "loss": 3.8236, "step": 1983 }, { "epoch": 1.5683794466403163, "grad_norm": 2.117363929748535, "learning_rate": 1.4166323150666678e-06, "loss": 4.1551, "step": 1984 }, { "epoch": 1.5691699604743083, "grad_norm": 2.3556230068206787, "learning_rate": 1.4153799873855827e-06, "loss": 4.0859, "step": 1985 }, { "epoch": 1.5699604743083004, "grad_norm": 2.3974735736846924, "learning_rate": 1.4141277188729785e-06, "loss": 4.1743, "step": 1986 }, { "epoch": 1.5707509881422925, "grad_norm": 1.9857592582702637, "learning_rate": 1.4128755104044724e-06, "loss": 4.2917, "step": 1987 }, { "epoch": 1.5715415019762846, "grad_norm": 2.4958808422088623, "learning_rate": 1.411623362855641e-06, "loss": 3.7277, "step": 1988 }, { "epoch": 1.5723320158102767, "grad_norm": 1.9082438945770264, "learning_rate": 1.410371277102019e-06, "loss": 4.301, "step": 1989 }, { "epoch": 1.5731225296442688, "grad_norm": 2.387601613998413, "learning_rate": 1.4091192540190948e-06, "loss": 3.934, "step": 1990 }, { "epoch": 1.5739130434782609, "grad_norm": 2.15490984916687, "learning_rate": 1.4078672944823167e-06, "loss": 4.1413, "step": 1991 }, { "epoch": 1.574703557312253, "grad_norm": 2.093170166015625, "learning_rate": 1.4066153993670866e-06, "loss": 4.4017, "step": 1992 }, { "epoch": 1.575494071146245, "grad_norm": 2.213327169418335, "learning_rate": 1.4053635695487609e-06, "loss": 4.1786, "step": 1993 }, { "epoch": 1.5762845849802372, "grad_norm": 2.7043721675872803, "learning_rate": 1.4041118059026519e-06, "loss": 3.802, "step": 1994 }, { "epoch": 1.5770750988142292, "grad_norm": 2.4317667484283447, "learning_rate": 1.4028601093040242e-06, "loss": 4.0514, "step": 1995 }, { "epoch": 1.5778656126482213, "grad_norm": 2.6386773586273193, "learning_rate": 1.4016084806280962e-06, "loss": 4.0987, "step": 1996 }, { "epoch": 1.5786561264822134, "grad_norm": 2.060713529586792, "learning_rate": 1.4003569207500392e-06, "loss": 3.9846, "step": 1997 }, { "epoch": 1.5794466403162055, "grad_norm": 2.196052074432373, "learning_rate": 1.3991054305449758e-06, "loss": 3.8613, "step": 1998 }, { "epoch": 1.5802371541501976, "grad_norm": 2.2631828784942627, "learning_rate": 1.3978540108879796e-06, "loss": 4.0331, "step": 1999 }, { "epoch": 1.5810276679841897, "grad_norm": 2.176809549331665, "learning_rate": 1.3966026626540753e-06, "loss": 3.9105, "step": 2000 }, { "epoch": 1.5810276679841897, "eval_loss": 3.9327237606048584, "eval_runtime": 3.8498, "eval_samples_per_second": 779.264, "eval_steps_per_second": 3.377, "step": 2000 }, { "epoch": 1.5818181818181818, "grad_norm": 2.4514968395233154, "learning_rate": 1.3953513867182385e-06, "loss": 4.2819, "step": 2001 }, { "epoch": 1.5826086956521739, "grad_norm": 2.2768301963806152, "learning_rate": 1.3941001839553922e-06, "loss": 3.7832, "step": 2002 }, { "epoch": 1.583399209486166, "grad_norm": 2.182016134262085, "learning_rate": 1.3928490552404095e-06, "loss": 4.0402, "step": 2003 }, { "epoch": 1.584189723320158, "grad_norm": 2.536087989807129, "learning_rate": 1.3915980014481132e-06, "loss": 3.806, "step": 2004 }, { "epoch": 1.5849802371541502, "grad_norm": 2.097435712814331, "learning_rate": 1.3903470234532703e-06, "loss": 4.0303, "step": 2005 }, { "epoch": 1.5857707509881422, "grad_norm": 2.092109441757202, "learning_rate": 1.3890961221305977e-06, "loss": 4.355, "step": 2006 }, { "epoch": 1.5865612648221346, "grad_norm": 1.9215307235717773, "learning_rate": 1.3878452983547576e-06, "loss": 4.4324, "step": 2007 }, { "epoch": 1.5873517786561266, "grad_norm": 2.0887176990509033, "learning_rate": 1.3865945530003575e-06, "loss": 4.1932, "step": 2008 }, { "epoch": 1.5881422924901187, "grad_norm": 2.118645429611206, "learning_rate": 1.3853438869419513e-06, "loss": 4.0067, "step": 2009 }, { "epoch": 1.5889328063241108, "grad_norm": 2.7346248626708984, "learning_rate": 1.3840933010540356e-06, "loss": 3.949, "step": 2010 }, { "epoch": 1.589723320158103, "grad_norm": 2.202892541885376, "learning_rate": 1.382842796211053e-06, "loss": 3.7986, "step": 2011 }, { "epoch": 1.590513833992095, "grad_norm": 2.629880905151367, "learning_rate": 1.381592373287389e-06, "loss": 4.1505, "step": 2012 }, { "epoch": 1.591304347826087, "grad_norm": 2.8139100074768066, "learning_rate": 1.38034203315737e-06, "loss": 3.9342, "step": 2013 }, { "epoch": 1.5920948616600792, "grad_norm": 2.1107749938964844, "learning_rate": 1.3790917766952668e-06, "loss": 4.0439, "step": 2014 }, { "epoch": 1.5928853754940713, "grad_norm": 2.5181729793548584, "learning_rate": 1.3778416047752906e-06, "loss": 3.9703, "step": 2015 }, { "epoch": 1.5936758893280634, "grad_norm": 2.151122808456421, "learning_rate": 1.3765915182715933e-06, "loss": 4.0598, "step": 2016 }, { "epoch": 1.5944664031620555, "grad_norm": 2.283740520477295, "learning_rate": 1.3753415180582672e-06, "loss": 3.7734, "step": 2017 }, { "epoch": 1.5952569169960475, "grad_norm": 2.01639986038208, "learning_rate": 1.3740916050093456e-06, "loss": 4.0638, "step": 2018 }, { "epoch": 1.5960474308300396, "grad_norm": 2.0952234268188477, "learning_rate": 1.3728417799987984e-06, "loss": 4.3159, "step": 2019 }, { "epoch": 1.5968379446640317, "grad_norm": 2.235900402069092, "learning_rate": 1.3715920439005364e-06, "loss": 3.6711, "step": 2020 }, { "epoch": 1.5976284584980238, "grad_norm": 2.2527785301208496, "learning_rate": 1.3703423975884065e-06, "loss": 3.7183, "step": 2021 }, { "epoch": 1.598418972332016, "grad_norm": 2.1228911876678467, "learning_rate": 1.369092841936193e-06, "loss": 4.0788, "step": 2022 }, { "epoch": 1.599209486166008, "grad_norm": 2.629908800125122, "learning_rate": 1.3678433778176176e-06, "loss": 3.8072, "step": 2023 }, { "epoch": 1.6, "grad_norm": 2.0660033226013184, "learning_rate": 1.3665940061063387e-06, "loss": 4.1798, "step": 2024 }, { "epoch": 1.6007905138339922, "grad_norm": 2.2987027168273926, "learning_rate": 1.365344727675947e-06, "loss": 3.9408, "step": 2025 }, { "epoch": 1.6015810276679843, "grad_norm": 2.123692750930786, "learning_rate": 1.364095543399971e-06, "loss": 3.942, "step": 2026 }, { "epoch": 1.6023715415019764, "grad_norm": 2.05753231048584, "learning_rate": 1.362846454151873e-06, "loss": 4.1342, "step": 2027 }, { "epoch": 1.6031620553359685, "grad_norm": 2.1798646450042725, "learning_rate": 1.3615974608050471e-06, "loss": 3.9201, "step": 2028 }, { "epoch": 1.6039525691699605, "grad_norm": 2.479008913040161, "learning_rate": 1.3603485642328215e-06, "loss": 3.6665, "step": 2029 }, { "epoch": 1.6047430830039526, "grad_norm": 1.993974208831787, "learning_rate": 1.3590997653084582e-06, "loss": 4.2641, "step": 2030 }, { "epoch": 1.6055335968379447, "grad_norm": 2.0734763145446777, "learning_rate": 1.3578510649051471e-06, "loss": 4.3086, "step": 2031 }, { "epoch": 1.6063241106719368, "grad_norm": 2.3914928436279297, "learning_rate": 1.356602463896013e-06, "loss": 4.1413, "step": 2032 }, { "epoch": 1.607114624505929, "grad_norm": 2.444844961166382, "learning_rate": 1.3553539631541098e-06, "loss": 4.0651, "step": 2033 }, { "epoch": 1.607905138339921, "grad_norm": 2.248107671737671, "learning_rate": 1.3541055635524201e-06, "loss": 4.099, "step": 2034 }, { "epoch": 1.608695652173913, "grad_norm": 2.091277837753296, "learning_rate": 1.3528572659638575e-06, "loss": 4.2416, "step": 2035 }, { "epoch": 1.6094861660079052, "grad_norm": 2.3193933963775635, "learning_rate": 1.3516090712612644e-06, "loss": 3.9971, "step": 2036 }, { "epoch": 1.6102766798418973, "grad_norm": 2.0440573692321777, "learning_rate": 1.3503609803174092e-06, "loss": 4.2977, "step": 2037 }, { "epoch": 1.6110671936758894, "grad_norm": 2.023850917816162, "learning_rate": 1.3491129940049906e-06, "loss": 4.0436, "step": 2038 }, { "epoch": 1.6118577075098814, "grad_norm": 2.44380784034729, "learning_rate": 1.3478651131966302e-06, "loss": 3.902, "step": 2039 }, { "epoch": 1.6126482213438735, "grad_norm": 2.295586585998535, "learning_rate": 1.3466173387648804e-06, "loss": 4.3584, "step": 2040 }, { "epoch": 1.6134387351778656, "grad_norm": 2.54902982711792, "learning_rate": 1.3453696715822166e-06, "loss": 3.5078, "step": 2041 }, { "epoch": 1.6142292490118577, "grad_norm": 2.0047659873962402, "learning_rate": 1.344122112521039e-06, "loss": 4.1025, "step": 2042 }, { "epoch": 1.6150197628458498, "grad_norm": 2.3203492164611816, "learning_rate": 1.342874662453673e-06, "loss": 3.614, "step": 2043 }, { "epoch": 1.615810276679842, "grad_norm": 1.9825453758239746, "learning_rate": 1.3416273222523688e-06, "loss": 4.1905, "step": 2044 }, { "epoch": 1.616600790513834, "grad_norm": 2.5642426013946533, "learning_rate": 1.3403800927892965e-06, "loss": 4.1957, "step": 2045 }, { "epoch": 1.617391304347826, "grad_norm": 2.087780714035034, "learning_rate": 1.3391329749365527e-06, "loss": 3.9414, "step": 2046 }, { "epoch": 1.6181818181818182, "grad_norm": 2.338264226913452, "learning_rate": 1.3378859695661536e-06, "loss": 3.9941, "step": 2047 }, { "epoch": 1.6189723320158103, "grad_norm": 2.602449655532837, "learning_rate": 1.336639077550037e-06, "loss": 3.7164, "step": 2048 }, { "epoch": 1.6197628458498023, "grad_norm": 2.314422130584717, "learning_rate": 1.335392299760062e-06, "loss": 4.1207, "step": 2049 }, { "epoch": 1.6205533596837944, "grad_norm": 2.1851751804351807, "learning_rate": 1.3341456370680078e-06, "loss": 3.8891, "step": 2050 }, { "epoch": 1.6213438735177865, "grad_norm": 2.5063135623931885, "learning_rate": 1.332899090345572e-06, "loss": 4.1169, "step": 2051 }, { "epoch": 1.6221343873517786, "grad_norm": 2.174067735671997, "learning_rate": 1.331652660464373e-06, "loss": 3.9046, "step": 2052 }, { "epoch": 1.6229249011857707, "grad_norm": 2.1440975666046143, "learning_rate": 1.3304063482959463e-06, "loss": 4.1194, "step": 2053 }, { "epoch": 1.6237154150197628, "grad_norm": 2.318582773208618, "learning_rate": 1.329160154711745e-06, "loss": 4.2107, "step": 2054 }, { "epoch": 1.6245059288537549, "grad_norm": 2.1245152950286865, "learning_rate": 1.3279140805831395e-06, "loss": 4.0019, "step": 2055 }, { "epoch": 1.625296442687747, "grad_norm": 2.104807138442993, "learning_rate": 1.3266681267814173e-06, "loss": 4.0019, "step": 2056 }, { "epoch": 1.626086956521739, "grad_norm": 2.3065381050109863, "learning_rate": 1.3254222941777804e-06, "loss": 4.0043, "step": 2057 }, { "epoch": 1.6268774703557312, "grad_norm": 2.647186517715454, "learning_rate": 1.3241765836433474e-06, "loss": 3.2821, "step": 2058 }, { "epoch": 1.6276679841897232, "grad_norm": 2.3540151119232178, "learning_rate": 1.3229309960491513e-06, "loss": 3.7684, "step": 2059 }, { "epoch": 1.6284584980237153, "grad_norm": 1.9531972408294678, "learning_rate": 1.321685532266138e-06, "loss": 4.2868, "step": 2060 }, { "epoch": 1.6292490118577074, "grad_norm": 2.279486894607544, "learning_rate": 1.3204401931651686e-06, "loss": 3.8456, "step": 2061 }, { "epoch": 1.6300395256916995, "grad_norm": 2.2543559074401855, "learning_rate": 1.3191949796170155e-06, "loss": 3.6091, "step": 2062 }, { "epoch": 1.6308300395256916, "grad_norm": 2.038205862045288, "learning_rate": 1.3179498924923642e-06, "loss": 4.0222, "step": 2063 }, { "epoch": 1.6316205533596837, "grad_norm": 2.229465961456299, "learning_rate": 1.316704932661811e-06, "loss": 3.7638, "step": 2064 }, { "epoch": 1.6324110671936758, "grad_norm": 2.565423011779785, "learning_rate": 1.3154601009958645e-06, "loss": 3.6615, "step": 2065 }, { "epoch": 1.6332015810276679, "grad_norm": 2.1319541931152344, "learning_rate": 1.3142153983649422e-06, "loss": 4.1257, "step": 2066 }, { "epoch": 1.63399209486166, "grad_norm": 2.1490397453308105, "learning_rate": 1.3129708256393726e-06, "loss": 3.8046, "step": 2067 }, { "epoch": 1.634782608695652, "grad_norm": 1.9972925186157227, "learning_rate": 1.3117263836893927e-06, "loss": 4.3449, "step": 2068 }, { "epoch": 1.6355731225296442, "grad_norm": 3.498967409133911, "learning_rate": 1.3104820733851476e-06, "loss": 3.8737, "step": 2069 }, { "epoch": 1.6363636363636362, "grad_norm": 2.159128189086914, "learning_rate": 1.309237895596692e-06, "loss": 4.0966, "step": 2070 }, { "epoch": 1.6371541501976283, "grad_norm": 2.27400279045105, "learning_rate": 1.3079938511939858e-06, "loss": 3.6722, "step": 2071 }, { "epoch": 1.6379446640316204, "grad_norm": 1.933083415031433, "learning_rate": 1.306749941046897e-06, "loss": 4.1502, "step": 2072 }, { "epoch": 1.6387351778656125, "grad_norm": 2.2816059589385986, "learning_rate": 1.3055061660252e-06, "loss": 3.9241, "step": 2073 }, { "epoch": 1.6395256916996046, "grad_norm": 2.2931854724884033, "learning_rate": 1.3042625269985737e-06, "loss": 4.0306, "step": 2074 }, { "epoch": 1.6403162055335967, "grad_norm": 2.151732921600342, "learning_rate": 1.3030190248366024e-06, "loss": 4.0283, "step": 2075 }, { "epoch": 1.6411067193675888, "grad_norm": 2.6141841411590576, "learning_rate": 1.3017756604087751e-06, "loss": 3.9041, "step": 2076 }, { "epoch": 1.6418972332015809, "grad_norm": 2.076444149017334, "learning_rate": 1.3005324345844832e-06, "loss": 4.4619, "step": 2077 }, { "epoch": 1.642687747035573, "grad_norm": 1.9721205234527588, "learning_rate": 1.2992893482330225e-06, "loss": 4.1479, "step": 2078 }, { "epoch": 1.643478260869565, "grad_norm": 2.2438552379608154, "learning_rate": 1.2980464022235912e-06, "loss": 4.0011, "step": 2079 }, { "epoch": 1.6442687747035574, "grad_norm": 2.761096954345703, "learning_rate": 1.2968035974252883e-06, "loss": 3.8639, "step": 2080 }, { "epoch": 1.6450592885375495, "grad_norm": 2.099247932434082, "learning_rate": 1.2955609347071153e-06, "loss": 3.9503, "step": 2081 }, { "epoch": 1.6458498023715415, "grad_norm": 2.3637139797210693, "learning_rate": 1.2943184149379735e-06, "loss": 4.0117, "step": 2082 }, { "epoch": 1.6466403162055336, "grad_norm": 2.319124698638916, "learning_rate": 1.2930760389866647e-06, "loss": 3.5481, "step": 2083 }, { "epoch": 1.6474308300395257, "grad_norm": 1.995736002922058, "learning_rate": 1.2918338077218893e-06, "loss": 4.3576, "step": 2084 }, { "epoch": 1.6482213438735178, "grad_norm": 2.263591766357422, "learning_rate": 1.290591722012249e-06, "loss": 4.0875, "step": 2085 }, { "epoch": 1.64901185770751, "grad_norm": 2.352844715118408, "learning_rate": 1.2893497827262397e-06, "loss": 3.8303, "step": 2086 }, { "epoch": 1.649802371541502, "grad_norm": 2.168248176574707, "learning_rate": 1.2881079907322587e-06, "loss": 3.9568, "step": 2087 }, { "epoch": 1.650592885375494, "grad_norm": 2.4959282875061035, "learning_rate": 1.2868663468985986e-06, "loss": 3.7512, "step": 2088 }, { "epoch": 1.6513833992094862, "grad_norm": 2.165734052658081, "learning_rate": 1.285624852093448e-06, "loss": 3.9648, "step": 2089 }, { "epoch": 1.6521739130434783, "grad_norm": 2.130211353302002, "learning_rate": 1.2843835071848918e-06, "loss": 4.219, "step": 2090 }, { "epoch": 1.6529644268774704, "grad_norm": 2.191737413406372, "learning_rate": 1.2831423130409114e-06, "loss": 3.8074, "step": 2091 }, { "epoch": 1.6537549407114625, "grad_norm": 2.2483067512512207, "learning_rate": 1.2819012705293798e-06, "loss": 4.0339, "step": 2092 }, { "epoch": 1.6545454545454545, "grad_norm": 2.2469780445098877, "learning_rate": 1.2806603805180669e-06, "loss": 3.9683, "step": 2093 }, { "epoch": 1.6553359683794466, "grad_norm": 1.959858775138855, "learning_rate": 1.2794196438746346e-06, "loss": 4.1763, "step": 2094 }, { "epoch": 1.6561264822134387, "grad_norm": 2.3986165523529053, "learning_rate": 1.2781790614666375e-06, "loss": 3.6473, "step": 2095 }, { "epoch": 1.6569169960474308, "grad_norm": 2.641336679458618, "learning_rate": 1.2769386341615222e-06, "loss": 4.0907, "step": 2096 }, { "epoch": 1.657707509881423, "grad_norm": 2.004929780960083, "learning_rate": 1.2756983628266289e-06, "loss": 4.3396, "step": 2097 }, { "epoch": 1.658498023715415, "grad_norm": 2.312922239303589, "learning_rate": 1.274458248329185e-06, "loss": 4.0967, "step": 2098 }, { "epoch": 1.659288537549407, "grad_norm": 2.019171714782715, "learning_rate": 1.273218291536312e-06, "loss": 4.237, "step": 2099 }, { "epoch": 1.6600790513833992, "grad_norm": 2.2820329666137695, "learning_rate": 1.2719784933150176e-06, "loss": 4.1224, "step": 2100 }, { "epoch": 1.6608695652173913, "grad_norm": 2.365535259246826, "learning_rate": 1.2707388545322023e-06, "loss": 3.6844, "step": 2101 }, { "epoch": 1.6616600790513834, "grad_norm": 2.1276161670684814, "learning_rate": 1.2694993760546526e-06, "loss": 4.287, "step": 2102 }, { "epoch": 1.6624505928853754, "grad_norm": 2.106891632080078, "learning_rate": 1.2682600587490433e-06, "loss": 4.2653, "step": 2103 }, { "epoch": 1.6632411067193675, "grad_norm": 2.2501087188720703, "learning_rate": 1.2670209034819368e-06, "loss": 3.5656, "step": 2104 }, { "epoch": 1.6640316205533598, "grad_norm": 2.399806022644043, "learning_rate": 1.2657819111197835e-06, "loss": 3.6698, "step": 2105 }, { "epoch": 1.664822134387352, "grad_norm": 2.164541244506836, "learning_rate": 1.2645430825289165e-06, "loss": 3.9456, "step": 2106 }, { "epoch": 1.665612648221344, "grad_norm": 2.0978620052337646, "learning_rate": 1.2633044185755578e-06, "loss": 4.1118, "step": 2107 }, { "epoch": 1.6664031620553361, "grad_norm": 2.144617795944214, "learning_rate": 1.2620659201258134e-06, "loss": 4.3466, "step": 2108 }, { "epoch": 1.6671936758893282, "grad_norm": 2.3861491680145264, "learning_rate": 1.260827588045672e-06, "loss": 4.0441, "step": 2109 }, { "epoch": 1.6679841897233203, "grad_norm": 2.055680751800537, "learning_rate": 1.2595894232010073e-06, "loss": 4.0969, "step": 2110 }, { "epoch": 1.6687747035573124, "grad_norm": 2.217392921447754, "learning_rate": 1.2583514264575768e-06, "loss": 3.8847, "step": 2111 }, { "epoch": 1.6695652173913045, "grad_norm": 2.436622381210327, "learning_rate": 1.257113598681018e-06, "loss": 3.9242, "step": 2112 }, { "epoch": 1.6703557312252966, "grad_norm": 2.374943971633911, "learning_rate": 1.2558759407368531e-06, "loss": 3.9387, "step": 2113 }, { "epoch": 1.6711462450592887, "grad_norm": 2.0591959953308105, "learning_rate": 1.2546384534904845e-06, "loss": 3.9877, "step": 2114 }, { "epoch": 1.6719367588932808, "grad_norm": 2.1136298179626465, "learning_rate": 1.2534011378071935e-06, "loss": 4.2698, "step": 2115 }, { "epoch": 1.6727272727272728, "grad_norm": 2.130025625228882, "learning_rate": 1.2521639945521438e-06, "loss": 4.0686, "step": 2116 }, { "epoch": 1.673517786561265, "grad_norm": 2.050135850906372, "learning_rate": 1.250927024590378e-06, "loss": 3.8798, "step": 2117 }, { "epoch": 1.674308300395257, "grad_norm": 2.2440688610076904, "learning_rate": 1.2496902287868163e-06, "loss": 4.136, "step": 2118 }, { "epoch": 1.6750988142292491, "grad_norm": 2.247800588607788, "learning_rate": 1.2484536080062584e-06, "loss": 4.192, "step": 2119 }, { "epoch": 1.6758893280632412, "grad_norm": 2.148385524749756, "learning_rate": 1.2472171631133817e-06, "loss": 3.722, "step": 2120 }, { "epoch": 1.6766798418972333, "grad_norm": 1.9905602931976318, "learning_rate": 1.2459808949727396e-06, "loss": 4.3066, "step": 2121 }, { "epoch": 1.6774703557312254, "grad_norm": 2.684671401977539, "learning_rate": 1.2447448044487625e-06, "loss": 3.6803, "step": 2122 }, { "epoch": 1.6782608695652175, "grad_norm": 2.034386396408081, "learning_rate": 1.2435088924057568e-06, "loss": 4.0852, "step": 2123 }, { "epoch": 1.6790513833992096, "grad_norm": 3.147953510284424, "learning_rate": 1.2422731597079036e-06, "loss": 3.9198, "step": 2124 }, { "epoch": 1.6798418972332017, "grad_norm": 2.353424549102783, "learning_rate": 1.2410376072192585e-06, "loss": 4.0343, "step": 2125 }, { "epoch": 1.6806324110671937, "grad_norm": 2.1719071865081787, "learning_rate": 1.2398022358037525e-06, "loss": 4.0471, "step": 2126 }, { "epoch": 1.6814229249011858, "grad_norm": 2.5250072479248047, "learning_rate": 1.2385670463251882e-06, "loss": 3.9475, "step": 2127 }, { "epoch": 1.682213438735178, "grad_norm": 2.0561068058013916, "learning_rate": 1.237332039647242e-06, "loss": 4.2666, "step": 2128 }, { "epoch": 1.68300395256917, "grad_norm": 2.353351593017578, "learning_rate": 1.2360972166334618e-06, "loss": 3.6436, "step": 2129 }, { "epoch": 1.683794466403162, "grad_norm": 2.3373899459838867, "learning_rate": 1.234862578147268e-06, "loss": 3.6347, "step": 2130 }, { "epoch": 1.6845849802371542, "grad_norm": 2.229119062423706, "learning_rate": 1.2336281250519513e-06, "loss": 4.0309, "step": 2131 }, { "epoch": 1.6853754940711463, "grad_norm": 2.4911553859710693, "learning_rate": 1.2323938582106723e-06, "loss": 3.7722, "step": 2132 }, { "epoch": 1.6861660079051384, "grad_norm": 2.1356616020202637, "learning_rate": 1.2311597784864626e-06, "loss": 4.1162, "step": 2133 }, { "epoch": 1.6869565217391305, "grad_norm": 2.18538761138916, "learning_rate": 1.2299258867422227e-06, "loss": 3.9408, "step": 2134 }, { "epoch": 1.6877470355731226, "grad_norm": 2.160471200942993, "learning_rate": 1.2286921838407208e-06, "loss": 3.8688, "step": 2135 }, { "epoch": 1.6885375494071146, "grad_norm": 2.109384298324585, "learning_rate": 1.2274586706445934e-06, "loss": 4.2024, "step": 2136 }, { "epoch": 1.6893280632411067, "grad_norm": 2.2164478302001953, "learning_rate": 1.2262253480163452e-06, "loss": 4.0851, "step": 2137 }, { "epoch": 1.6901185770750988, "grad_norm": 2.0164434909820557, "learning_rate": 1.2249922168183463e-06, "loss": 4.1713, "step": 2138 }, { "epoch": 1.690909090909091, "grad_norm": 2.249847888946533, "learning_rate": 1.2237592779128334e-06, "loss": 4.102, "step": 2139 }, { "epoch": 1.691699604743083, "grad_norm": 2.358579635620117, "learning_rate": 1.2225265321619108e-06, "loss": 3.5956, "step": 2140 }, { "epoch": 1.692490118577075, "grad_norm": 2.0965378284454346, "learning_rate": 1.2212939804275433e-06, "loss": 4.0416, "step": 2141 }, { "epoch": 1.6932806324110672, "grad_norm": 2.2560956478118896, "learning_rate": 1.220061623571564e-06, "loss": 4.0001, "step": 2142 }, { "epoch": 1.6940711462450593, "grad_norm": 2.388387680053711, "learning_rate": 1.2188294624556692e-06, "loss": 3.6649, "step": 2143 }, { "epoch": 1.6948616600790514, "grad_norm": 2.601963520050049, "learning_rate": 1.2175974979414158e-06, "loss": 3.8577, "step": 2144 }, { "epoch": 1.6956521739130435, "grad_norm": 2.2142322063446045, "learning_rate": 1.2163657308902256e-06, "loss": 4.0043, "step": 2145 }, { "epoch": 1.6964426877470355, "grad_norm": 1.9986302852630615, "learning_rate": 1.2151341621633828e-06, "loss": 4.2957, "step": 2146 }, { "epoch": 1.6972332015810276, "grad_norm": 2.2367138862609863, "learning_rate": 1.2139027926220298e-06, "loss": 3.9495, "step": 2147 }, { "epoch": 1.6980237154150197, "grad_norm": 2.1270484924316406, "learning_rate": 1.212671623127173e-06, "loss": 4.0942, "step": 2148 }, { "epoch": 1.6988142292490118, "grad_norm": 2.5075926780700684, "learning_rate": 1.2114406545396778e-06, "loss": 3.8039, "step": 2149 }, { "epoch": 1.699604743083004, "grad_norm": 2.8341033458709717, "learning_rate": 1.2102098877202682e-06, "loss": 3.6956, "step": 2150 }, { "epoch": 1.700395256916996, "grad_norm": 1.9946744441986084, "learning_rate": 1.208979323529528e-06, "loss": 3.9966, "step": 2151 }, { "epoch": 1.701185770750988, "grad_norm": 2.2164306640625, "learning_rate": 1.2077489628279004e-06, "loss": 4.1072, "step": 2152 }, { "epoch": 1.7019762845849802, "grad_norm": 2.0629587173461914, "learning_rate": 1.2065188064756832e-06, "loss": 4.1542, "step": 2153 }, { "epoch": 1.7027667984189723, "grad_norm": 2.120542287826538, "learning_rate": 1.2052888553330343e-06, "loss": 4.2954, "step": 2154 }, { "epoch": 1.7035573122529644, "grad_norm": 2.2842013835906982, "learning_rate": 1.2040591102599675e-06, "loss": 3.922, "step": 2155 }, { "epoch": 1.7043478260869565, "grad_norm": 2.4500908851623535, "learning_rate": 1.202829572116351e-06, "loss": 3.9162, "step": 2156 }, { "epoch": 1.7051383399209485, "grad_norm": 2.373321056365967, "learning_rate": 1.2016002417619101e-06, "loss": 3.9529, "step": 2157 }, { "epoch": 1.7059288537549406, "grad_norm": 2.223165512084961, "learning_rate": 1.2003711200562244e-06, "loss": 3.9709, "step": 2158 }, { "epoch": 1.7067193675889327, "grad_norm": 2.493049144744873, "learning_rate": 1.1991422078587256e-06, "loss": 3.6104, "step": 2159 }, { "epoch": 1.7075098814229248, "grad_norm": 1.9324822425842285, "learning_rate": 1.1979135060287028e-06, "loss": 4.303, "step": 2160 }, { "epoch": 1.708300395256917, "grad_norm": 2.270420551300049, "learning_rate": 1.1966850154252938e-06, "loss": 3.6976, "step": 2161 }, { "epoch": 1.709090909090909, "grad_norm": 2.0898854732513428, "learning_rate": 1.195456736907492e-06, "loss": 4.3058, "step": 2162 }, { "epoch": 1.709881422924901, "grad_norm": 2.3541245460510254, "learning_rate": 1.194228671334141e-06, "loss": 3.8309, "step": 2163 }, { "epoch": 1.7106719367588932, "grad_norm": 2.2674691677093506, "learning_rate": 1.1930008195639352e-06, "loss": 4.127, "step": 2164 }, { "epoch": 1.7114624505928853, "grad_norm": 2.2067158222198486, "learning_rate": 1.19177318245542e-06, "loss": 4.3073, "step": 2165 }, { "epoch": 1.7122529644268774, "grad_norm": 2.5221574306488037, "learning_rate": 1.1905457608669923e-06, "loss": 3.8359, "step": 2166 }, { "epoch": 1.7130434782608694, "grad_norm": 2.5137407779693604, "learning_rate": 1.189318555656894e-06, "loss": 4.1115, "step": 2167 }, { "epoch": 1.7138339920948615, "grad_norm": 1.9908279180526733, "learning_rate": 1.1880915676832205e-06, "loss": 4.2637, "step": 2168 }, { "epoch": 1.7146245059288536, "grad_norm": 2.196354627609253, "learning_rate": 1.1868647978039127e-06, "loss": 3.9265, "step": 2169 }, { "epoch": 1.7154150197628457, "grad_norm": 2.06400990486145, "learning_rate": 1.1856382468767593e-06, "loss": 4.0544, "step": 2170 }, { "epoch": 1.7162055335968378, "grad_norm": 2.584239959716797, "learning_rate": 1.1844119157593961e-06, "loss": 3.9168, "step": 2171 }, { "epoch": 1.71699604743083, "grad_norm": 2.479269504547119, "learning_rate": 1.1831858053093058e-06, "loss": 3.7989, "step": 2172 }, { "epoch": 1.717786561264822, "grad_norm": 2.4407100677490234, "learning_rate": 1.1819599163838151e-06, "loss": 3.9847, "step": 2173 }, { "epoch": 1.718577075098814, "grad_norm": 2.2352006435394287, "learning_rate": 1.180734249840098e-06, "loss": 3.8451, "step": 2174 }, { "epoch": 1.7193675889328062, "grad_norm": 2.160641670227051, "learning_rate": 1.179508806535172e-06, "loss": 3.9283, "step": 2175 }, { "epoch": 1.7201581027667983, "grad_norm": 2.1610970497131348, "learning_rate": 1.1782835873258977e-06, "loss": 4.295, "step": 2176 }, { "epoch": 1.7209486166007903, "grad_norm": 2.091292142868042, "learning_rate": 1.17705859306898e-06, "loss": 4.0003, "step": 2177 }, { "epoch": 1.7217391304347827, "grad_norm": 2.062894344329834, "learning_rate": 1.1758338246209665e-06, "loss": 3.9293, "step": 2178 }, { "epoch": 1.7225296442687748, "grad_norm": 2.3220198154449463, "learning_rate": 1.1746092828382466e-06, "loss": 3.8438, "step": 2179 }, { "epoch": 1.7233201581027668, "grad_norm": 2.4248337745666504, "learning_rate": 1.1733849685770507e-06, "loss": 3.7864, "step": 2180 }, { "epoch": 1.724110671936759, "grad_norm": 2.178452491760254, "learning_rate": 1.1721608826934516e-06, "loss": 4.0929, "step": 2181 }, { "epoch": 1.724901185770751, "grad_norm": 2.226580858230591, "learning_rate": 1.1709370260433612e-06, "loss": 4.0871, "step": 2182 }, { "epoch": 1.7256916996047431, "grad_norm": 2.3278186321258545, "learning_rate": 1.1697133994825312e-06, "loss": 4.0792, "step": 2183 }, { "epoch": 1.7264822134387352, "grad_norm": 2.2541162967681885, "learning_rate": 1.168490003866553e-06, "loss": 4.0901, "step": 2184 }, { "epoch": 1.7272727272727273, "grad_norm": 2.28283953666687, "learning_rate": 1.1672668400508557e-06, "loss": 4.0911, "step": 2185 }, { "epoch": 1.7280632411067194, "grad_norm": 2.1893603801727295, "learning_rate": 1.1660439088907066e-06, "loss": 4.3318, "step": 2186 }, { "epoch": 1.7288537549407115, "grad_norm": 2.2862863540649414, "learning_rate": 1.164821211241212e-06, "loss": 3.7928, "step": 2187 }, { "epoch": 1.7296442687747036, "grad_norm": 2.369239568710327, "learning_rate": 1.163598747957311e-06, "loss": 3.5198, "step": 2188 }, { "epoch": 1.7304347826086957, "grad_norm": 2.409519672393799, "learning_rate": 1.1623765198937836e-06, "loss": 3.7418, "step": 2189 }, { "epoch": 1.7312252964426877, "grad_norm": 2.97839093208313, "learning_rate": 1.1611545279052414e-06, "loss": 4.0091, "step": 2190 }, { "epoch": 1.7320158102766798, "grad_norm": 2.65765380859375, "learning_rate": 1.159932772846133e-06, "loss": 4.0166, "step": 2191 }, { "epoch": 1.732806324110672, "grad_norm": 2.09125018119812, "learning_rate": 1.158711255570741e-06, "loss": 4.033, "step": 2192 }, { "epoch": 1.733596837944664, "grad_norm": 2.0903429985046387, "learning_rate": 1.1574899769331813e-06, "loss": 4.2326, "step": 2193 }, { "epoch": 1.734387351778656, "grad_norm": 2.207402229309082, "learning_rate": 1.1562689377874028e-06, "loss": 4.1858, "step": 2194 }, { "epoch": 1.7351778656126482, "grad_norm": 2.2563931941986084, "learning_rate": 1.1550481389871888e-06, "loss": 3.7337, "step": 2195 }, { "epoch": 1.7359683794466403, "grad_norm": 2.113767147064209, "learning_rate": 1.1538275813861516e-06, "loss": 4.135, "step": 2196 }, { "epoch": 1.7367588932806324, "grad_norm": 2.250952959060669, "learning_rate": 1.1526072658377372e-06, "loss": 3.7092, "step": 2197 }, { "epoch": 1.7375494071146245, "grad_norm": 2.158241033554077, "learning_rate": 1.1513871931952213e-06, "loss": 3.9925, "step": 2198 }, { "epoch": 1.7383399209486166, "grad_norm": 3.0094869136810303, "learning_rate": 1.1501673643117094e-06, "loss": 4.0085, "step": 2199 }, { "epoch": 1.7391304347826086, "grad_norm": 2.228912353515625, "learning_rate": 1.1489477800401375e-06, "loss": 3.7764, "step": 2200 }, { "epoch": 1.7399209486166007, "grad_norm": 2.2560195922851562, "learning_rate": 1.147728441233271e-06, "loss": 3.5726, "step": 2201 }, { "epoch": 1.740711462450593, "grad_norm": 2.857039213180542, "learning_rate": 1.1465093487437013e-06, "loss": 3.4797, "step": 2202 }, { "epoch": 1.7415019762845851, "grad_norm": 2.645209789276123, "learning_rate": 1.1452905034238496e-06, "loss": 3.5743, "step": 2203 }, { "epoch": 1.7422924901185772, "grad_norm": 2.08221697807312, "learning_rate": 1.1440719061259647e-06, "loss": 4.2309, "step": 2204 }, { "epoch": 1.7430830039525693, "grad_norm": 2.482703447341919, "learning_rate": 1.1428535577021199e-06, "loss": 3.8563, "step": 2205 }, { "epoch": 1.7438735177865614, "grad_norm": 2.3771986961364746, "learning_rate": 1.141635459004216e-06, "loss": 3.987, "step": 2206 }, { "epoch": 1.7446640316205535, "grad_norm": 2.2881948947906494, "learning_rate": 1.1404176108839795e-06, "loss": 3.807, "step": 2207 }, { "epoch": 1.7454545454545456, "grad_norm": 2.0664234161376953, "learning_rate": 1.1392000141929597e-06, "loss": 4.3907, "step": 2208 }, { "epoch": 1.7462450592885377, "grad_norm": 1.928506851196289, "learning_rate": 1.1379826697825323e-06, "loss": 4.1777, "step": 2209 }, { "epoch": 1.7470355731225298, "grad_norm": 2.4899723529815674, "learning_rate": 1.1367655785038958e-06, "loss": 3.8784, "step": 2210 }, { "epoch": 1.7478260869565219, "grad_norm": 2.594074249267578, "learning_rate": 1.135548741208071e-06, "loss": 4.0069, "step": 2211 }, { "epoch": 1.748616600790514, "grad_norm": 2.2876784801483154, "learning_rate": 1.1343321587459015e-06, "loss": 3.5173, "step": 2212 }, { "epoch": 1.749407114624506, "grad_norm": 2.4012258052825928, "learning_rate": 1.1331158319680546e-06, "loss": 3.7026, "step": 2213 }, { "epoch": 1.7501976284584981, "grad_norm": 2.046375274658203, "learning_rate": 1.1318997617250149e-06, "loss": 4.0928, "step": 2214 }, { "epoch": 1.7509881422924902, "grad_norm": 2.119033098220825, "learning_rate": 1.130683948867091e-06, "loss": 3.9027, "step": 2215 }, { "epoch": 1.7517786561264823, "grad_norm": 2.3774054050445557, "learning_rate": 1.1294683942444108e-06, "loss": 4.323, "step": 2216 }, { "epoch": 1.7525691699604744, "grad_norm": 2.32057785987854, "learning_rate": 1.1282530987069203e-06, "loss": 3.507, "step": 2217 }, { "epoch": 1.7533596837944665, "grad_norm": 2.069509267807007, "learning_rate": 1.1270380631043859e-06, "loss": 4.3224, "step": 2218 }, { "epoch": 1.7541501976284586, "grad_norm": 2.364652395248413, "learning_rate": 1.1258232882863917e-06, "loss": 3.8523, "step": 2219 }, { "epoch": 1.7549407114624507, "grad_norm": 2.3364572525024414, "learning_rate": 1.1246087751023383e-06, "loss": 3.9394, "step": 2220 }, { "epoch": 1.7557312252964428, "grad_norm": 2.2890138626098633, "learning_rate": 1.1233945244014461e-06, "loss": 4.0481, "step": 2221 }, { "epoch": 1.7565217391304349, "grad_norm": 2.2992982864379883, "learning_rate": 1.1221805370327487e-06, "loss": 3.3286, "step": 2222 }, { "epoch": 1.757312252964427, "grad_norm": 2.132540464401245, "learning_rate": 1.120966813845098e-06, "loss": 4.3313, "step": 2223 }, { "epoch": 1.758102766798419, "grad_norm": 2.374795913696289, "learning_rate": 1.1197533556871605e-06, "loss": 4.1754, "step": 2224 }, { "epoch": 1.7588932806324111, "grad_norm": 2.0399794578552246, "learning_rate": 1.118540163407417e-06, "loss": 4.0845, "step": 2225 }, { "epoch": 1.7596837944664032, "grad_norm": 1.9116907119750977, "learning_rate": 1.117327237854162e-06, "loss": 4.4521, "step": 2226 }, { "epoch": 1.7604743083003953, "grad_norm": 2.347005844116211, "learning_rate": 1.1161145798755054e-06, "loss": 3.7511, "step": 2227 }, { "epoch": 1.7612648221343874, "grad_norm": 2.208932399749756, "learning_rate": 1.1149021903193674e-06, "loss": 3.9533, "step": 2228 }, { "epoch": 1.7620553359683795, "grad_norm": 2.1732113361358643, "learning_rate": 1.1136900700334826e-06, "loss": 4.0074, "step": 2229 }, { "epoch": 1.7628458498023716, "grad_norm": 2.2012805938720703, "learning_rate": 1.1124782198653966e-06, "loss": 4.1806, "step": 2230 }, { "epoch": 1.7636363636363637, "grad_norm": 1.956260085105896, "learning_rate": 1.1112666406624655e-06, "loss": 4.5015, "step": 2231 }, { "epoch": 1.7644268774703558, "grad_norm": 2.12349796295166, "learning_rate": 1.1100553332718572e-06, "loss": 4.3334, "step": 2232 }, { "epoch": 1.7652173913043478, "grad_norm": 2.4130358695983887, "learning_rate": 1.1088442985405485e-06, "loss": 3.7909, "step": 2233 }, { "epoch": 1.76600790513834, "grad_norm": 2.089066505432129, "learning_rate": 1.1076335373153256e-06, "loss": 4.2759, "step": 2234 }, { "epoch": 1.766798418972332, "grad_norm": 2.228564977645874, "learning_rate": 1.1064230504427844e-06, "loss": 4.1312, "step": 2235 }, { "epoch": 1.7675889328063241, "grad_norm": 2.206040859222412, "learning_rate": 1.1052128387693283e-06, "loss": 4.0965, "step": 2236 }, { "epoch": 1.7683794466403162, "grad_norm": 2.2134790420532227, "learning_rate": 1.1040029031411676e-06, "loss": 3.9449, "step": 2237 }, { "epoch": 1.7691699604743083, "grad_norm": 1.9254735708236694, "learning_rate": 1.102793244404321e-06, "loss": 4.3597, "step": 2238 }, { "epoch": 1.7699604743083004, "grad_norm": 2.6168124675750732, "learning_rate": 1.1015838634046126e-06, "loss": 3.8127, "step": 2239 }, { "epoch": 1.7707509881422925, "grad_norm": 2.065718650817871, "learning_rate": 1.1003747609876724e-06, "loss": 4.2161, "step": 2240 }, { "epoch": 1.7715415019762846, "grad_norm": 2.1106209754943848, "learning_rate": 1.0991659379989356e-06, "loss": 4.087, "step": 2241 }, { "epoch": 1.7723320158102767, "grad_norm": 2.443594217300415, "learning_rate": 1.0979573952836432e-06, "loss": 3.712, "step": 2242 }, { "epoch": 1.7731225296442688, "grad_norm": 2.236603021621704, "learning_rate": 1.0967491336868383e-06, "loss": 4.0317, "step": 2243 }, { "epoch": 1.7739130434782608, "grad_norm": 2.493178129196167, "learning_rate": 1.0955411540533685e-06, "loss": 4.0133, "step": 2244 }, { "epoch": 1.774703557312253, "grad_norm": 2.181206703186035, "learning_rate": 1.0943334572278847e-06, "loss": 4.0564, "step": 2245 }, { "epoch": 1.775494071146245, "grad_norm": 2.1546778678894043, "learning_rate": 1.0931260440548384e-06, "loss": 4.087, "step": 2246 }, { "epoch": 1.7762845849802371, "grad_norm": 2.7099740505218506, "learning_rate": 1.0919189153784844e-06, "loss": 3.8997, "step": 2247 }, { "epoch": 1.7770750988142292, "grad_norm": 2.262272596359253, "learning_rate": 1.0907120720428789e-06, "loss": 4.0046, "step": 2248 }, { "epoch": 1.7778656126482213, "grad_norm": 1.9858298301696777, "learning_rate": 1.0895055148918757e-06, "loss": 4.0949, "step": 2249 }, { "epoch": 1.7786561264822134, "grad_norm": 2.714948892593384, "learning_rate": 1.0882992447691322e-06, "loss": 3.8746, "step": 2250 }, { "epoch": 1.7794466403162055, "grad_norm": 2.2949204444885254, "learning_rate": 1.0870932625181026e-06, "loss": 4.0543, "step": 2251 }, { "epoch": 1.7802371541501976, "grad_norm": 2.5711617469787598, "learning_rate": 1.0858875689820403e-06, "loss": 3.6488, "step": 2252 }, { "epoch": 1.7810276679841897, "grad_norm": 2.4469809532165527, "learning_rate": 1.0846821650039982e-06, "loss": 3.7134, "step": 2253 }, { "epoch": 1.7818181818181817, "grad_norm": 2.080799102783203, "learning_rate": 1.0834770514268245e-06, "loss": 4.2275, "step": 2254 }, { "epoch": 1.7826086956521738, "grad_norm": 2.41434383392334, "learning_rate": 1.0822722290931655e-06, "loss": 3.3983, "step": 2255 }, { "epoch": 1.783399209486166, "grad_norm": 2.1476120948791504, "learning_rate": 1.0810676988454653e-06, "loss": 3.9649, "step": 2256 }, { "epoch": 1.784189723320158, "grad_norm": 2.171931266784668, "learning_rate": 1.0798634615259602e-06, "loss": 3.9138, "step": 2257 }, { "epoch": 1.78498023715415, "grad_norm": 2.3123092651367188, "learning_rate": 1.0786595179766853e-06, "loss": 3.8615, "step": 2258 }, { "epoch": 1.7857707509881422, "grad_norm": 2.037029266357422, "learning_rate": 1.0774558690394687e-06, "loss": 4.0614, "step": 2259 }, { "epoch": 1.7865612648221343, "grad_norm": 2.5823845863342285, "learning_rate": 1.0762525155559316e-06, "loss": 3.8953, "step": 2260 }, { "epoch": 1.7873517786561264, "grad_norm": 2.582339286804199, "learning_rate": 1.07504945836749e-06, "loss": 3.7035, "step": 2261 }, { "epoch": 1.7881422924901185, "grad_norm": 2.1569316387176514, "learning_rate": 1.0738466983153535e-06, "loss": 4.3088, "step": 2262 }, { "epoch": 1.7889328063241106, "grad_norm": 2.280256748199463, "learning_rate": 1.0726442362405208e-06, "loss": 3.9179, "step": 2263 }, { "epoch": 1.7897233201581026, "grad_norm": 2.1074373722076416, "learning_rate": 1.0714420729837851e-06, "loss": 3.8384, "step": 2264 }, { "epoch": 1.7905138339920947, "grad_norm": 2.253413677215576, "learning_rate": 1.0702402093857303e-06, "loss": 3.9756, "step": 2265 }, { "epoch": 1.7913043478260868, "grad_norm": 2.2149314880371094, "learning_rate": 1.0690386462867294e-06, "loss": 4.0786, "step": 2266 }, { "epoch": 1.792094861660079, "grad_norm": 2.314135789871216, "learning_rate": 1.0678373845269458e-06, "loss": 3.81, "step": 2267 }, { "epoch": 1.792885375494071, "grad_norm": 2.3503024578094482, "learning_rate": 1.0666364249463339e-06, "loss": 3.8772, "step": 2268 }, { "epoch": 1.793675889328063, "grad_norm": 2.3574059009552, "learning_rate": 1.0654357683846338e-06, "loss": 3.9717, "step": 2269 }, { "epoch": 1.7944664031620552, "grad_norm": 2.137181282043457, "learning_rate": 1.0642354156813763e-06, "loss": 4.0884, "step": 2270 }, { "epoch": 1.7952569169960473, "grad_norm": 2.4403200149536133, "learning_rate": 1.0630353676758788e-06, "loss": 3.5869, "step": 2271 }, { "epoch": 1.7960474308300394, "grad_norm": 2.3515982627868652, "learning_rate": 1.0618356252072451e-06, "loss": 4.005, "step": 2272 }, { "epoch": 1.7968379446640315, "grad_norm": 2.1678075790405273, "learning_rate": 1.060636189114366e-06, "loss": 3.8269, "step": 2273 }, { "epoch": 1.7976284584980236, "grad_norm": 2.2742767333984375, "learning_rate": 1.0594370602359183e-06, "loss": 4.1974, "step": 2274 }, { "epoch": 1.7984189723320159, "grad_norm": 2.1233556270599365, "learning_rate": 1.058238239410363e-06, "loss": 4.1926, "step": 2275 }, { "epoch": 1.799209486166008, "grad_norm": 2.2846012115478516, "learning_rate": 1.0570397274759468e-06, "loss": 3.562, "step": 2276 }, { "epoch": 1.8, "grad_norm": 2.071044921875, "learning_rate": 1.0558415252707004e-06, "loss": 4.1243, "step": 2277 }, { "epoch": 1.8007905138339921, "grad_norm": 2.833124876022339, "learning_rate": 1.0546436336324363e-06, "loss": 3.7091, "step": 2278 }, { "epoch": 1.8015810276679842, "grad_norm": 2.2969789505004883, "learning_rate": 1.053446053398752e-06, "loss": 3.9525, "step": 2279 }, { "epoch": 1.8023715415019763, "grad_norm": 2.0870888233184814, "learning_rate": 1.0522487854070255e-06, "loss": 3.9133, "step": 2280 }, { "epoch": 1.8031620553359684, "grad_norm": 2.047163963317871, "learning_rate": 1.0510518304944175e-06, "loss": 4.3971, "step": 2281 }, { "epoch": 1.8039525691699605, "grad_norm": 2.0931217670440674, "learning_rate": 1.0498551894978706e-06, "loss": 4.1735, "step": 2282 }, { "epoch": 1.8047430830039526, "grad_norm": 2.8801486492156982, "learning_rate": 1.048658863254105e-06, "loss": 3.8256, "step": 2283 }, { "epoch": 1.8055335968379447, "grad_norm": 1.941636085510254, "learning_rate": 1.0474628525996238e-06, "loss": 4.2281, "step": 2284 }, { "epoch": 1.8063241106719368, "grad_norm": 2.3794054985046387, "learning_rate": 1.0462671583707083e-06, "loss": 3.8213, "step": 2285 }, { "epoch": 1.8071146245059289, "grad_norm": 2.065281391143799, "learning_rate": 1.045071781403418e-06, "loss": 4.3552, "step": 2286 }, { "epoch": 1.807905138339921, "grad_norm": 2.266793727874756, "learning_rate": 1.0438767225335917e-06, "loss": 3.9155, "step": 2287 }, { "epoch": 1.808695652173913, "grad_norm": 2.4998106956481934, "learning_rate": 1.042681982596845e-06, "loss": 3.9446, "step": 2288 }, { "epoch": 1.8094861660079051, "grad_norm": 2.151766538619995, "learning_rate": 1.0414875624285698e-06, "loss": 4.0546, "step": 2289 }, { "epoch": 1.8102766798418972, "grad_norm": 2.2256405353546143, "learning_rate": 1.0402934628639366e-06, "loss": 3.5838, "step": 2290 }, { "epoch": 1.8110671936758893, "grad_norm": 2.084360122680664, "learning_rate": 1.0390996847378898e-06, "loss": 3.9313, "step": 2291 }, { "epoch": 1.8118577075098814, "grad_norm": 2.2175724506378174, "learning_rate": 1.0379062288851496e-06, "loss": 4.0353, "step": 2292 }, { "epoch": 1.8126482213438735, "grad_norm": 2.3131816387176514, "learning_rate": 1.0367130961402109e-06, "loss": 4.0173, "step": 2293 }, { "epoch": 1.8134387351778656, "grad_norm": 2.3757190704345703, "learning_rate": 1.035520287337343e-06, "loss": 3.6487, "step": 2294 }, { "epoch": 1.8142292490118577, "grad_norm": 2.2513349056243896, "learning_rate": 1.0343278033105879e-06, "loss": 3.9224, "step": 2295 }, { "epoch": 1.8150197628458498, "grad_norm": 2.165501832962036, "learning_rate": 1.0331356448937608e-06, "loss": 4.122, "step": 2296 }, { "epoch": 1.8158102766798419, "grad_norm": 2.158015727996826, "learning_rate": 1.0319438129204503e-06, "loss": 4.0182, "step": 2297 }, { "epoch": 1.816600790513834, "grad_norm": 2.26969575881958, "learning_rate": 1.0307523082240152e-06, "loss": 3.9998, "step": 2298 }, { "epoch": 1.8173913043478263, "grad_norm": 2.0369486808776855, "learning_rate": 1.029561131637586e-06, "loss": 4.3207, "step": 2299 }, { "epoch": 1.8181818181818183, "grad_norm": 2.070573329925537, "learning_rate": 1.0283702839940648e-06, "loss": 3.971, "step": 2300 }, { "epoch": 1.8189723320158104, "grad_norm": 2.1382224559783936, "learning_rate": 1.0271797661261215e-06, "loss": 4.07, "step": 2301 }, { "epoch": 1.8197628458498025, "grad_norm": 2.070951223373413, "learning_rate": 1.025989578866197e-06, "loss": 4.2306, "step": 2302 }, { "epoch": 1.8205533596837946, "grad_norm": 2.1040825843811035, "learning_rate": 1.024799723046502e-06, "loss": 4.0989, "step": 2303 }, { "epoch": 1.8213438735177867, "grad_norm": 2.051177501678467, "learning_rate": 1.0236101994990124e-06, "loss": 4.3297, "step": 2304 }, { "epoch": 1.8221343873517788, "grad_norm": 2.246316909790039, "learning_rate": 1.0224210090554745e-06, "loss": 4.0244, "step": 2305 }, { "epoch": 1.8229249011857709, "grad_norm": 2.2434048652648926, "learning_rate": 1.021232152547401e-06, "loss": 4.0614, "step": 2306 }, { "epoch": 1.823715415019763, "grad_norm": 2.186248779296875, "learning_rate": 1.0200436308060696e-06, "loss": 4.2582, "step": 2307 }, { "epoch": 1.824505928853755, "grad_norm": 3.455203056335449, "learning_rate": 1.0188554446625261e-06, "loss": 3.7284, "step": 2308 }, { "epoch": 1.8252964426877472, "grad_norm": 2.34033465385437, "learning_rate": 1.017667594947581e-06, "loss": 3.8215, "step": 2309 }, { "epoch": 1.8260869565217392, "grad_norm": 2.136549234390259, "learning_rate": 1.0164800824918082e-06, "loss": 3.9098, "step": 2310 }, { "epoch": 1.8268774703557313, "grad_norm": 2.218038320541382, "learning_rate": 1.0152929081255478e-06, "loss": 3.6621, "step": 2311 }, { "epoch": 1.8276679841897234, "grad_norm": 2.024277687072754, "learning_rate": 1.0141060726789017e-06, "loss": 4.1698, "step": 2312 }, { "epoch": 1.8284584980237155, "grad_norm": 2.0792415142059326, "learning_rate": 1.0129195769817361e-06, "loss": 4.1248, "step": 2313 }, { "epoch": 1.8292490118577076, "grad_norm": 2.067476987838745, "learning_rate": 1.0117334218636795e-06, "loss": 4.1114, "step": 2314 }, { "epoch": 1.8300395256916997, "grad_norm": 2.30015230178833, "learning_rate": 1.0105476081541213e-06, "loss": 3.8223, "step": 2315 }, { "epoch": 1.8308300395256918, "grad_norm": 2.284959077835083, "learning_rate": 1.0093621366822127e-06, "loss": 4.1537, "step": 2316 }, { "epoch": 1.8316205533596839, "grad_norm": 2.477113962173462, "learning_rate": 1.0081770082768669e-06, "loss": 3.4636, "step": 2317 }, { "epoch": 1.832411067193676, "grad_norm": 2.1368634700775146, "learning_rate": 1.0069922237667545e-06, "loss": 3.999, "step": 2318 }, { "epoch": 1.833201581027668, "grad_norm": 2.267850637435913, "learning_rate": 1.0058077839803082e-06, "loss": 4.1939, "step": 2319 }, { "epoch": 1.8339920948616601, "grad_norm": 2.46919846534729, "learning_rate": 1.0046236897457186e-06, "loss": 4.0692, "step": 2320 }, { "epoch": 1.8347826086956522, "grad_norm": 3.463388442993164, "learning_rate": 1.0034399418909342e-06, "loss": 3.9923, "step": 2321 }, { "epoch": 1.8355731225296443, "grad_norm": 2.62080717086792, "learning_rate": 1.0022565412436619e-06, "loss": 3.0766, "step": 2322 }, { "epoch": 1.8363636363636364, "grad_norm": 2.1765692234039307, "learning_rate": 1.0010734886313665e-06, "loss": 4.0875, "step": 2323 }, { "epoch": 1.8371541501976285, "grad_norm": 2.1018121242523193, "learning_rate": 9.998907848812677e-07, "loss": 3.8297, "step": 2324 }, { "epoch": 1.8379446640316206, "grad_norm": 2.2504653930664062, "learning_rate": 9.987084308203429e-07, "loss": 4.1901, "step": 2325 }, { "epoch": 1.8387351778656127, "grad_norm": 2.247976303100586, "learning_rate": 9.975264272753243e-07, "loss": 4.015, "step": 2326 }, { "epoch": 1.8395256916996048, "grad_norm": 2.3783445358276367, "learning_rate": 9.963447750726985e-07, "loss": 3.5944, "step": 2327 }, { "epoch": 1.8403162055335969, "grad_norm": 2.2174489498138428, "learning_rate": 9.951634750387073e-07, "loss": 3.9968, "step": 2328 }, { "epoch": 1.841106719367589, "grad_norm": 2.179391622543335, "learning_rate": 9.939825279993468e-07, "loss": 4.1431, "step": 2329 }, { "epoch": 1.841897233201581, "grad_norm": 2.315997838973999, "learning_rate": 9.92801934780364e-07, "loss": 3.8724, "step": 2330 }, { "epoch": 1.8426877470355731, "grad_norm": 2.1842000484466553, "learning_rate": 9.916216962072606e-07, "loss": 4.1467, "step": 2331 }, { "epoch": 1.8434782608695652, "grad_norm": 2.262197971343994, "learning_rate": 9.9044181310529e-07, "loss": 4.0547, "step": 2332 }, { "epoch": 1.8442687747035573, "grad_norm": 2.0702552795410156, "learning_rate": 9.892622862994562e-07, "loss": 4.355, "step": 2333 }, { "epoch": 1.8450592885375494, "grad_norm": 2.256521701812744, "learning_rate": 9.880831166145144e-07, "loss": 3.82, "step": 2334 }, { "epoch": 1.8458498023715415, "grad_norm": 2.1371164321899414, "learning_rate": 9.86904304874971e-07, "loss": 4.1047, "step": 2335 }, { "epoch": 1.8466403162055336, "grad_norm": 2.2950093746185303, "learning_rate": 9.857258519050801e-07, "loss": 4.2336, "step": 2336 }, { "epoch": 1.8474308300395257, "grad_norm": 2.074692964553833, "learning_rate": 9.845477585288474e-07, "loss": 4.092, "step": 2337 }, { "epoch": 1.8482213438735178, "grad_norm": 1.9837701320648193, "learning_rate": 9.833700255700258e-07, "loss": 4.2782, "step": 2338 }, { "epoch": 1.8490118577075099, "grad_norm": 2.1331570148468018, "learning_rate": 9.821926538521158e-07, "loss": 3.9775, "step": 2339 }, { "epoch": 1.849802371541502, "grad_norm": 2.0978214740753174, "learning_rate": 9.810156441983667e-07, "loss": 4.1333, "step": 2340 }, { "epoch": 1.850592885375494, "grad_norm": 2.4908676147460938, "learning_rate": 9.798389974317725e-07, "loss": 3.8909, "step": 2341 }, { "epoch": 1.8513833992094861, "grad_norm": 2.1791539192199707, "learning_rate": 9.786627143750758e-07, "loss": 4.0493, "step": 2342 }, { "epoch": 1.8521739130434782, "grad_norm": 2.259037494659424, "learning_rate": 9.77486795850764e-07, "loss": 3.7205, "step": 2343 }, { "epoch": 1.8529644268774703, "grad_norm": 2.1109044551849365, "learning_rate": 9.763112426810683e-07, "loss": 4.128, "step": 2344 }, { "epoch": 1.8537549407114624, "grad_norm": 2.6669089794158936, "learning_rate": 9.751360556879665e-07, "loss": 3.5651, "step": 2345 }, { "epoch": 1.8545454545454545, "grad_norm": 2.2195193767547607, "learning_rate": 9.739612356931796e-07, "loss": 4.3402, "step": 2346 }, { "epoch": 1.8553359683794466, "grad_norm": 2.197763681411743, "learning_rate": 9.727867835181712e-07, "loss": 4.1308, "step": 2347 }, { "epoch": 1.8561264822134387, "grad_norm": 2.068894386291504, "learning_rate": 9.716126999841487e-07, "loss": 4.2798, "step": 2348 }, { "epoch": 1.8569169960474308, "grad_norm": 2.2263174057006836, "learning_rate": 9.704389859120613e-07, "loss": 3.7546, "step": 2349 }, { "epoch": 1.8577075098814229, "grad_norm": 2.3578126430511475, "learning_rate": 9.692656421225996e-07, "loss": 3.9983, "step": 2350 }, { "epoch": 1.858498023715415, "grad_norm": 3.200679302215576, "learning_rate": 9.680926694361964e-07, "loss": 3.7652, "step": 2351 }, { "epoch": 1.859288537549407, "grad_norm": 2.3472776412963867, "learning_rate": 9.669200686730242e-07, "loss": 3.7219, "step": 2352 }, { "epoch": 1.8600790513833991, "grad_norm": 2.4546115398406982, "learning_rate": 9.657478406529947e-07, "loss": 3.8638, "step": 2353 }, { "epoch": 1.8608695652173912, "grad_norm": 3.374516248703003, "learning_rate": 9.645759861957603e-07, "loss": 4.0503, "step": 2354 }, { "epoch": 1.8616600790513833, "grad_norm": 2.1684820652008057, "learning_rate": 9.634045061207117e-07, "loss": 3.9519, "step": 2355 }, { "epoch": 1.8624505928853754, "grad_norm": 2.3118953704833984, "learning_rate": 9.622334012469775e-07, "loss": 3.9819, "step": 2356 }, { "epoch": 1.8632411067193675, "grad_norm": 2.018571376800537, "learning_rate": 9.610626723934236e-07, "loss": 4.1207, "step": 2357 }, { "epoch": 1.8640316205533596, "grad_norm": 2.572662591934204, "learning_rate": 9.598923203786553e-07, "loss": 4.1841, "step": 2358 }, { "epoch": 1.8648221343873517, "grad_norm": 2.196475028991699, "learning_rate": 9.587223460210117e-07, "loss": 3.8661, "step": 2359 }, { "epoch": 1.8656126482213438, "grad_norm": 2.0090110301971436, "learning_rate": 9.575527501385687e-07, "loss": 4.2896, "step": 2360 }, { "epoch": 1.8664031620553359, "grad_norm": 2.2550127506256104, "learning_rate": 9.563835335491382e-07, "loss": 3.933, "step": 2361 }, { "epoch": 1.867193675889328, "grad_norm": 2.1013965606689453, "learning_rate": 9.552146970702654e-07, "loss": 4.142, "step": 2362 }, { "epoch": 1.86798418972332, "grad_norm": 2.0689430236816406, "learning_rate": 9.540462415192319e-07, "loss": 4.2242, "step": 2363 }, { "epoch": 1.8687747035573121, "grad_norm": 2.059093952178955, "learning_rate": 9.528781677130515e-07, "loss": 4.0455, "step": 2364 }, { "epoch": 1.8695652173913042, "grad_norm": 2.3180508613586426, "learning_rate": 9.517104764684705e-07, "loss": 4.0425, "step": 2365 }, { "epoch": 1.8703557312252963, "grad_norm": 2.152240514755249, "learning_rate": 9.505431686019693e-07, "loss": 4.097, "step": 2366 }, { "epoch": 1.8711462450592884, "grad_norm": 2.581190347671509, "learning_rate": 9.493762449297595e-07, "loss": 3.6475, "step": 2367 }, { "epoch": 1.8719367588932805, "grad_norm": 2.117626190185547, "learning_rate": 9.482097062677837e-07, "loss": 4.1026, "step": 2368 }, { "epoch": 1.8727272727272726, "grad_norm": 2.767766237258911, "learning_rate": 9.470435534317158e-07, "loss": 3.8653, "step": 2369 }, { "epoch": 1.8735177865612647, "grad_norm": 2.1659669876098633, "learning_rate": 9.458777872369595e-07, "loss": 4.1656, "step": 2370 }, { "epoch": 1.8743083003952568, "grad_norm": 2.455906629562378, "learning_rate": 9.447124084986481e-07, "loss": 3.8175, "step": 2371 }, { "epoch": 1.875098814229249, "grad_norm": 2.167851448059082, "learning_rate": 9.435474180316453e-07, "loss": 3.8189, "step": 2372 }, { "epoch": 1.8758893280632412, "grad_norm": 2.3047475814819336, "learning_rate": 9.423828166505415e-07, "loss": 3.8919, "step": 2373 }, { "epoch": 1.8766798418972332, "grad_norm": 2.310429573059082, "learning_rate": 9.412186051696559e-07, "loss": 4.0484, "step": 2374 }, { "epoch": 1.8774703557312253, "grad_norm": 2.135955572128296, "learning_rate": 9.400547844030352e-07, "loss": 4.0126, "step": 2375 }, { "epoch": 1.8782608695652174, "grad_norm": 2.099898338317871, "learning_rate": 9.388913551644525e-07, "loss": 4.2439, "step": 2376 }, { "epoch": 1.8790513833992095, "grad_norm": 2.6171088218688965, "learning_rate": 9.37728318267407e-07, "loss": 4.1023, "step": 2377 }, { "epoch": 1.8798418972332016, "grad_norm": 2.595557689666748, "learning_rate": 9.365656745251253e-07, "loss": 3.8599, "step": 2378 }, { "epoch": 1.8806324110671937, "grad_norm": 2.4246561527252197, "learning_rate": 9.354034247505557e-07, "loss": 3.4948, "step": 2379 }, { "epoch": 1.8814229249011858, "grad_norm": 2.8015127182006836, "learning_rate": 9.342415697563742e-07, "loss": 4.0754, "step": 2380 }, { "epoch": 1.8822134387351779, "grad_norm": 2.564699411392212, "learning_rate": 9.330801103549798e-07, "loss": 3.8322, "step": 2381 }, { "epoch": 1.88300395256917, "grad_norm": 2.2290704250335693, "learning_rate": 9.319190473584938e-07, "loss": 3.9982, "step": 2382 }, { "epoch": 1.883794466403162, "grad_norm": 2.0381617546081543, "learning_rate": 9.307583815787613e-07, "loss": 4.1571, "step": 2383 }, { "epoch": 1.8845849802371542, "grad_norm": 2.177039384841919, "learning_rate": 9.295981138273509e-07, "loss": 4.356, "step": 2384 }, { "epoch": 1.8853754940711462, "grad_norm": 1.946447730064392, "learning_rate": 9.284382449155496e-07, "loss": 4.3826, "step": 2385 }, { "epoch": 1.8861660079051383, "grad_norm": 2.3717029094696045, "learning_rate": 9.272787756543685e-07, "loss": 3.7801, "step": 2386 }, { "epoch": 1.8869565217391304, "grad_norm": 2.167964458465576, "learning_rate": 9.261197068545383e-07, "loss": 4.1233, "step": 2387 }, { "epoch": 1.8877470355731225, "grad_norm": 2.1893680095672607, "learning_rate": 9.249610393265091e-07, "loss": 4.119, "step": 2388 }, { "epoch": 1.8885375494071146, "grad_norm": 2.331430196762085, "learning_rate": 9.238027738804509e-07, "loss": 3.8719, "step": 2389 }, { "epoch": 1.8893280632411067, "grad_norm": 2.3979225158691406, "learning_rate": 9.226449113262539e-07, "loss": 4.0221, "step": 2390 }, { "epoch": 1.8901185770750988, "grad_norm": 2.805326223373413, "learning_rate": 9.21487452473523e-07, "loss": 3.7663, "step": 2391 }, { "epoch": 1.8909090909090909, "grad_norm": 2.120180130004883, "learning_rate": 9.203303981315848e-07, "loss": 4.2603, "step": 2392 }, { "epoch": 1.891699604743083, "grad_norm": 2.3680834770202637, "learning_rate": 9.19173749109481e-07, "loss": 3.9146, "step": 2393 }, { "epoch": 1.892490118577075, "grad_norm": 2.1677772998809814, "learning_rate": 9.180175062159697e-07, "loss": 4.1175, "step": 2394 }, { "epoch": 1.8932806324110671, "grad_norm": 2.325105905532837, "learning_rate": 9.168616702595257e-07, "loss": 3.8631, "step": 2395 }, { "epoch": 1.8940711462450592, "grad_norm": 2.0858583450317383, "learning_rate": 9.157062420483399e-07, "loss": 4.4345, "step": 2396 }, { "epoch": 1.8948616600790515, "grad_norm": 2.1251447200775146, "learning_rate": 9.145512223903158e-07, "loss": 3.8915, "step": 2397 }, { "epoch": 1.8956521739130436, "grad_norm": 2.0192372798919678, "learning_rate": 9.133966120930743e-07, "loss": 4.2972, "step": 2398 }, { "epoch": 1.8964426877470357, "grad_norm": 2.3505663871765137, "learning_rate": 9.122424119639478e-07, "loss": 3.8729, "step": 2399 }, { "epoch": 1.8972332015810278, "grad_norm": 2.195124626159668, "learning_rate": 9.110886228099825e-07, "loss": 4.04, "step": 2400 }, { "epoch": 1.89802371541502, "grad_norm": 1.9551937580108643, "learning_rate": 9.099352454379376e-07, "loss": 4.1586, "step": 2401 }, { "epoch": 1.898814229249012, "grad_norm": 3.178328037261963, "learning_rate": 9.087822806542837e-07, "loss": 4.0263, "step": 2402 }, { "epoch": 1.899604743083004, "grad_norm": 2.489710807800293, "learning_rate": 9.076297292652034e-07, "loss": 4.2585, "step": 2403 }, { "epoch": 1.9003952569169962, "grad_norm": 2.282409191131592, "learning_rate": 9.064775920765908e-07, "loss": 3.9903, "step": 2404 }, { "epoch": 1.9011857707509883, "grad_norm": 2.235665798187256, "learning_rate": 9.053258698940485e-07, "loss": 4.1049, "step": 2405 }, { "epoch": 1.9019762845849804, "grad_norm": 2.189316987991333, "learning_rate": 9.041745635228914e-07, "loss": 3.9537, "step": 2406 }, { "epoch": 1.9027667984189724, "grad_norm": 2.295086145401001, "learning_rate": 9.03023673768142e-07, "loss": 4.0327, "step": 2407 }, { "epoch": 1.9035573122529645, "grad_norm": 2.2737746238708496, "learning_rate": 9.018732014345316e-07, "loss": 4.0634, "step": 2408 }, { "epoch": 1.9043478260869566, "grad_norm": 2.3026599884033203, "learning_rate": 9.007231473265e-07, "loss": 3.8435, "step": 2409 }, { "epoch": 1.9051383399209487, "grad_norm": 2.1797807216644287, "learning_rate": 8.995735122481951e-07, "loss": 4.1772, "step": 2410 }, { "epoch": 1.9059288537549408, "grad_norm": 2.215468406677246, "learning_rate": 8.984242970034704e-07, "loss": 3.8154, "step": 2411 }, { "epoch": 1.906719367588933, "grad_norm": 2.1736433506011963, "learning_rate": 8.972755023958865e-07, "loss": 3.6934, "step": 2412 }, { "epoch": 1.907509881422925, "grad_norm": 2.1777684688568115, "learning_rate": 8.961271292287111e-07, "loss": 3.9552, "step": 2413 }, { "epoch": 1.908300395256917, "grad_norm": 2.206012725830078, "learning_rate": 8.949791783049153e-07, "loss": 4.0237, "step": 2414 }, { "epoch": 1.9090909090909092, "grad_norm": 2.1404807567596436, "learning_rate": 8.938316504271759e-07, "loss": 3.9315, "step": 2415 }, { "epoch": 1.9098814229249013, "grad_norm": 2.0977604389190674, "learning_rate": 8.926845463978741e-07, "loss": 4.2401, "step": 2416 }, { "epoch": 1.9106719367588934, "grad_norm": 2.1653847694396973, "learning_rate": 8.915378670190941e-07, "loss": 4.2269, "step": 2417 }, { "epoch": 1.9114624505928854, "grad_norm": 2.816734552383423, "learning_rate": 8.903916130926231e-07, "loss": 4.2001, "step": 2418 }, { "epoch": 1.9122529644268775, "grad_norm": 2.2002222537994385, "learning_rate": 8.892457854199524e-07, "loss": 3.6755, "step": 2419 }, { "epoch": 1.9130434782608696, "grad_norm": 2.4137485027313232, "learning_rate": 8.88100384802273e-07, "loss": 3.8245, "step": 2420 }, { "epoch": 1.9138339920948617, "grad_norm": 2.0766873359680176, "learning_rate": 8.869554120404787e-07, "loss": 4.2607, "step": 2421 }, { "epoch": 1.9146245059288538, "grad_norm": 2.1710076332092285, "learning_rate": 8.858108679351642e-07, "loss": 4.0256, "step": 2422 }, { "epoch": 1.915415019762846, "grad_norm": 1.97329580783844, "learning_rate": 8.846667532866233e-07, "loss": 3.8677, "step": 2423 }, { "epoch": 1.916205533596838, "grad_norm": 2.159311532974243, "learning_rate": 8.835230688948503e-07, "loss": 4.0751, "step": 2424 }, { "epoch": 1.91699604743083, "grad_norm": 2.348723888397217, "learning_rate": 8.823798155595399e-07, "loss": 4.2079, "step": 2425 }, { "epoch": 1.9177865612648222, "grad_norm": 2.1292667388916016, "learning_rate": 8.812369940800822e-07, "loss": 4.1592, "step": 2426 }, { "epoch": 1.9185770750988143, "grad_norm": 1.9525893926620483, "learning_rate": 8.800946052555685e-07, "loss": 4.4368, "step": 2427 }, { "epoch": 1.9193675889328063, "grad_norm": 2.2957558631896973, "learning_rate": 8.789526498847864e-07, "loss": 3.8266, "step": 2428 }, { "epoch": 1.9201581027667984, "grad_norm": 2.1549108028411865, "learning_rate": 8.778111287662199e-07, "loss": 3.9997, "step": 2429 }, { "epoch": 1.9209486166007905, "grad_norm": 2.0405938625335693, "learning_rate": 8.766700426980498e-07, "loss": 4.0376, "step": 2430 }, { "epoch": 1.9217391304347826, "grad_norm": 2.3293967247009277, "learning_rate": 8.755293924781523e-07, "loss": 3.8604, "step": 2431 }, { "epoch": 1.9225296442687747, "grad_norm": 2.516849994659424, "learning_rate": 8.743891789041003e-07, "loss": 3.9099, "step": 2432 }, { "epoch": 1.9233201581027668, "grad_norm": 2.122878074645996, "learning_rate": 8.732494027731601e-07, "loss": 4.045, "step": 2433 }, { "epoch": 1.9241106719367589, "grad_norm": 1.9731781482696533, "learning_rate": 8.721100648822917e-07, "loss": 4.2703, "step": 2434 }, { "epoch": 1.924901185770751, "grad_norm": 2.063896656036377, "learning_rate": 8.709711660281488e-07, "loss": 4.155, "step": 2435 }, { "epoch": 1.925691699604743, "grad_norm": 2.5045249462127686, "learning_rate": 8.698327070070812e-07, "loss": 3.8214, "step": 2436 }, { "epoch": 1.9264822134387352, "grad_norm": 2.212222099304199, "learning_rate": 8.686946886151261e-07, "loss": 3.8739, "step": 2437 }, { "epoch": 1.9272727272727272, "grad_norm": 2.8899219036102295, "learning_rate": 8.675571116480159e-07, "loss": 3.8736, "step": 2438 }, { "epoch": 1.9280632411067193, "grad_norm": 2.274634599685669, "learning_rate": 8.664199769011738e-07, "loss": 4.0197, "step": 2439 }, { "epoch": 1.9288537549407114, "grad_norm": 2.120466947555542, "learning_rate": 8.652832851697131e-07, "loss": 4.0021, "step": 2440 }, { "epoch": 1.9296442687747035, "grad_norm": 2.130730152130127, "learning_rate": 8.641470372484383e-07, "loss": 4.0374, "step": 2441 }, { "epoch": 1.9304347826086956, "grad_norm": 2.1588621139526367, "learning_rate": 8.630112339318433e-07, "loss": 4.0881, "step": 2442 }, { "epoch": 1.9312252964426877, "grad_norm": 2.1039984226226807, "learning_rate": 8.618758760141091e-07, "loss": 4.0828, "step": 2443 }, { "epoch": 1.9320158102766798, "grad_norm": 2.0204381942749023, "learning_rate": 8.607409642891091e-07, "loss": 4.2609, "step": 2444 }, { "epoch": 1.9328063241106719, "grad_norm": 2.20874285697937, "learning_rate": 8.596064995504025e-07, "loss": 4.2172, "step": 2445 }, { "epoch": 1.933596837944664, "grad_norm": 2.3610732555389404, "learning_rate": 8.584724825912348e-07, "loss": 3.5982, "step": 2446 }, { "epoch": 1.934387351778656, "grad_norm": 2.2164735794067383, "learning_rate": 8.573389142045396e-07, "loss": 4.0136, "step": 2447 }, { "epoch": 1.9351778656126482, "grad_norm": 3.266169786453247, "learning_rate": 8.56205795182939e-07, "loss": 3.9622, "step": 2448 }, { "epoch": 1.9359683794466402, "grad_norm": 2.0560128688812256, "learning_rate": 8.550731263187371e-07, "loss": 4.1697, "step": 2449 }, { "epoch": 1.9367588932806323, "grad_norm": 2.2595932483673096, "learning_rate": 8.539409084039253e-07, "loss": 3.8241, "step": 2450 }, { "epoch": 1.9375494071146244, "grad_norm": 2.2191638946533203, "learning_rate": 8.528091422301795e-07, "loss": 3.7592, "step": 2451 }, { "epoch": 1.9383399209486165, "grad_norm": 2.2313506603240967, "learning_rate": 8.516778285888598e-07, "loss": 3.9975, "step": 2452 }, { "epoch": 1.9391304347826086, "grad_norm": 1.9040426015853882, "learning_rate": 8.505469682710094e-07, "loss": 4.5228, "step": 2453 }, { "epoch": 1.9399209486166007, "grad_norm": 2.2184865474700928, "learning_rate": 8.494165620673557e-07, "loss": 4.0918, "step": 2454 }, { "epoch": 1.9407114624505928, "grad_norm": 2.10098934173584, "learning_rate": 8.482866107683053e-07, "loss": 4.3201, "step": 2455 }, { "epoch": 1.9415019762845849, "grad_norm": 2.229576349258423, "learning_rate": 8.471571151639515e-07, "loss": 3.8958, "step": 2456 }, { "epoch": 1.942292490118577, "grad_norm": 2.387669324874878, "learning_rate": 8.460280760440664e-07, "loss": 3.7502, "step": 2457 }, { "epoch": 1.943083003952569, "grad_norm": 2.165020704269409, "learning_rate": 8.448994941981015e-07, "loss": 4.1137, "step": 2458 }, { "epoch": 1.9438735177865611, "grad_norm": 2.382488965988159, "learning_rate": 8.437713704151913e-07, "loss": 3.7992, "step": 2459 }, { "epoch": 1.9446640316205532, "grad_norm": 2.1889424324035645, "learning_rate": 8.426437054841486e-07, "loss": 3.9247, "step": 2460 }, { "epoch": 1.9454545454545453, "grad_norm": 2.241722345352173, "learning_rate": 8.415165001934654e-07, "loss": 3.8966, "step": 2461 }, { "epoch": 1.9462450592885374, "grad_norm": 2.277387857437134, "learning_rate": 8.403897553313134e-07, "loss": 4.2758, "step": 2462 }, { "epoch": 1.9470355731225295, "grad_norm": 2.1806650161743164, "learning_rate": 8.392634716855397e-07, "loss": 3.9811, "step": 2463 }, { "epoch": 1.9478260869565216, "grad_norm": 1.9429703950881958, "learning_rate": 8.381376500436722e-07, "loss": 4.247, "step": 2464 }, { "epoch": 1.9486166007905137, "grad_norm": 2.501371383666992, "learning_rate": 8.370122911929144e-07, "loss": 3.9805, "step": 2465 }, { "epoch": 1.9494071146245058, "grad_norm": 2.2822115421295166, "learning_rate": 8.358873959201447e-07, "loss": 3.7916, "step": 2466 }, { "epoch": 1.9501976284584979, "grad_norm": 2.2651126384735107, "learning_rate": 8.347629650119197e-07, "loss": 3.7905, "step": 2467 }, { "epoch": 1.95098814229249, "grad_norm": 2.073137044906616, "learning_rate": 8.336389992544696e-07, "loss": 4.2503, "step": 2468 }, { "epoch": 1.951778656126482, "grad_norm": 2.4184751510620117, "learning_rate": 8.325154994337007e-07, "loss": 3.7979, "step": 2469 }, { "epoch": 1.9525691699604744, "grad_norm": 2.5469977855682373, "learning_rate": 8.313924663351927e-07, "loss": 3.8956, "step": 2470 }, { "epoch": 1.9533596837944665, "grad_norm": 2.2191429138183594, "learning_rate": 8.302699007441989e-07, "loss": 4.213, "step": 2471 }, { "epoch": 1.9541501976284585, "grad_norm": 2.1375672817230225, "learning_rate": 8.291478034456459e-07, "loss": 3.9721, "step": 2472 }, { "epoch": 1.9549407114624506, "grad_norm": 2.1410768032073975, "learning_rate": 8.280261752241333e-07, "loss": 4.0305, "step": 2473 }, { "epoch": 1.9557312252964427, "grad_norm": 2.7951879501342773, "learning_rate": 8.269050168639322e-07, "loss": 3.984, "step": 2474 }, { "epoch": 1.9565217391304348, "grad_norm": 3.039705991744995, "learning_rate": 8.257843291489846e-07, "loss": 4.0682, "step": 2475 }, { "epoch": 1.957312252964427, "grad_norm": 2.627967357635498, "learning_rate": 8.246641128629041e-07, "loss": 4.0073, "step": 2476 }, { "epoch": 1.958102766798419, "grad_norm": 2.260915756225586, "learning_rate": 8.23544368788976e-07, "loss": 4.0075, "step": 2477 }, { "epoch": 1.958893280632411, "grad_norm": 2.065060615539551, "learning_rate": 8.224250977101524e-07, "loss": 4.3139, "step": 2478 }, { "epoch": 1.9596837944664032, "grad_norm": 2.499053716659546, "learning_rate": 8.213063004090569e-07, "loss": 4.0394, "step": 2479 }, { "epoch": 1.9604743083003953, "grad_norm": 4.75435209274292, "learning_rate": 8.201879776679815e-07, "loss": 3.4943, "step": 2480 }, { "epoch": 1.9612648221343874, "grad_norm": 2.3241379261016846, "learning_rate": 8.190701302688859e-07, "loss": 3.803, "step": 2481 }, { "epoch": 1.9620553359683794, "grad_norm": 2.132819890975952, "learning_rate": 8.179527589933975e-07, "loss": 4.0505, "step": 2482 }, { "epoch": 1.9628458498023715, "grad_norm": 2.3774871826171875, "learning_rate": 8.168358646228116e-07, "loss": 4.0218, "step": 2483 }, { "epoch": 1.9636363636363636, "grad_norm": 2.047264575958252, "learning_rate": 8.157194479380877e-07, "loss": 4.1879, "step": 2484 }, { "epoch": 1.9644268774703557, "grad_norm": 2.44775128364563, "learning_rate": 8.146035097198547e-07, "loss": 3.7663, "step": 2485 }, { "epoch": 1.9652173913043478, "grad_norm": 2.1401493549346924, "learning_rate": 8.134880507484049e-07, "loss": 4.0876, "step": 2486 }, { "epoch": 1.96600790513834, "grad_norm": 3.363077163696289, "learning_rate": 8.12373071803695e-07, "loss": 4.301, "step": 2487 }, { "epoch": 1.966798418972332, "grad_norm": 2.54364275932312, "learning_rate": 8.112585736653466e-07, "loss": 3.7079, "step": 2488 }, { "epoch": 1.967588932806324, "grad_norm": 1.999074101448059, "learning_rate": 8.101445571126474e-07, "loss": 4.2172, "step": 2489 }, { "epoch": 1.9683794466403162, "grad_norm": 2.29371976852417, "learning_rate": 8.090310229245441e-07, "loss": 4.1213, "step": 2490 }, { "epoch": 1.9691699604743083, "grad_norm": 2.136976480484009, "learning_rate": 8.079179718796497e-07, "loss": 4.1031, "step": 2491 }, { "epoch": 1.9699604743083003, "grad_norm": 2.1753642559051514, "learning_rate": 8.068054047562365e-07, "loss": 4.2192, "step": 2492 }, { "epoch": 1.9707509881422924, "grad_norm": 2.095510959625244, "learning_rate": 8.056933223322415e-07, "loss": 3.977, "step": 2493 }, { "epoch": 1.9715415019762847, "grad_norm": 2.2172350883483887, "learning_rate": 8.045817253852608e-07, "loss": 3.9744, "step": 2494 }, { "epoch": 1.9723320158102768, "grad_norm": 2.070446729660034, "learning_rate": 8.034706146925509e-07, "loss": 4.1764, "step": 2495 }, { "epoch": 1.973122529644269, "grad_norm": 2.1798572540283203, "learning_rate": 8.023599910310288e-07, "loss": 3.8235, "step": 2496 }, { "epoch": 1.973913043478261, "grad_norm": 2.115931749343872, "learning_rate": 8.012498551772722e-07, "loss": 4.0506, "step": 2497 }, { "epoch": 1.974703557312253, "grad_norm": 2.150221824645996, "learning_rate": 8.001402079075155e-07, "loss": 3.9295, "step": 2498 }, { "epoch": 1.9754940711462452, "grad_norm": 2.663422107696533, "learning_rate": 7.990310499976526e-07, "loss": 3.9506, "step": 2499 }, { "epoch": 1.9762845849802373, "grad_norm": 2.301938056945801, "learning_rate": 7.979223822232352e-07, "loss": 4.0252, "step": 2500 }, { "epoch": 1.9770750988142294, "grad_norm": 2.2207205295562744, "learning_rate": 7.968142053594724e-07, "loss": 4.0815, "step": 2501 }, { "epoch": 1.9778656126482215, "grad_norm": 2.1706326007843018, "learning_rate": 7.957065201812298e-07, "loss": 3.8996, "step": 2502 }, { "epoch": 1.9786561264822136, "grad_norm": 2.2755930423736572, "learning_rate": 7.945993274630297e-07, "loss": 4.1618, "step": 2503 }, { "epoch": 1.9794466403162057, "grad_norm": 2.177933692932129, "learning_rate": 7.934926279790478e-07, "loss": 4.0298, "step": 2504 }, { "epoch": 1.9802371541501977, "grad_norm": 2.1110849380493164, "learning_rate": 7.923864225031185e-07, "loss": 4.1275, "step": 2505 }, { "epoch": 1.9810276679841898, "grad_norm": 2.5199055671691895, "learning_rate": 7.912807118087293e-07, "loss": 3.6138, "step": 2506 }, { "epoch": 1.981818181818182, "grad_norm": 2.080439805984497, "learning_rate": 7.901754966690202e-07, "loss": 4.0398, "step": 2507 }, { "epoch": 1.982608695652174, "grad_norm": 2.4211034774780273, "learning_rate": 7.890707778567862e-07, "loss": 4.2771, "step": 2508 }, { "epoch": 1.983399209486166, "grad_norm": 2.0961766242980957, "learning_rate": 7.879665561444752e-07, "loss": 4.0046, "step": 2509 }, { "epoch": 1.9841897233201582, "grad_norm": 2.175046443939209, "learning_rate": 7.868628323041874e-07, "loss": 4.1886, "step": 2510 }, { "epoch": 1.9849802371541503, "grad_norm": 2.2305245399475098, "learning_rate": 7.857596071076747e-07, "loss": 4.1442, "step": 2511 }, { "epoch": 1.9857707509881424, "grad_norm": 2.33640456199646, "learning_rate": 7.846568813263402e-07, "loss": 4.213, "step": 2512 }, { "epoch": 1.9865612648221345, "grad_norm": 2.016645908355713, "learning_rate": 7.835546557312384e-07, "loss": 4.2425, "step": 2513 }, { "epoch": 1.9873517786561266, "grad_norm": 2.4839260578155518, "learning_rate": 7.824529310930733e-07, "loss": 3.5964, "step": 2514 }, { "epoch": 1.9881422924901186, "grad_norm": 2.6685054302215576, "learning_rate": 7.813517081822e-07, "loss": 4.1443, "step": 2515 }, { "epoch": 1.9889328063241107, "grad_norm": 2.6335391998291016, "learning_rate": 7.802509877686195e-07, "loss": 3.4251, "step": 2516 }, { "epoch": 1.9897233201581028, "grad_norm": 2.42032527923584, "learning_rate": 7.791507706219857e-07, "loss": 3.7922, "step": 2517 }, { "epoch": 1.990513833992095, "grad_norm": 2.5556752681732178, "learning_rate": 7.780510575115989e-07, "loss": 3.8078, "step": 2518 }, { "epoch": 1.991304347826087, "grad_norm": 2.0778887271881104, "learning_rate": 7.769518492064051e-07, "loss": 4.1669, "step": 2519 }, { "epoch": 1.992094861660079, "grad_norm": 2.3597683906555176, "learning_rate": 7.758531464749997e-07, "loss": 4.0445, "step": 2520 }, { "epoch": 1.9928853754940712, "grad_norm": 3.274254560470581, "learning_rate": 7.747549500856237e-07, "loss": 4.278, "step": 2521 }, { "epoch": 1.9936758893280633, "grad_norm": 2.19840931892395, "learning_rate": 7.736572608061641e-07, "loss": 4.1323, "step": 2522 }, { "epoch": 1.9944664031620554, "grad_norm": 2.127302885055542, "learning_rate": 7.72560079404154e-07, "loss": 4.1923, "step": 2523 }, { "epoch": 1.9952569169960475, "grad_norm": 2.1407992839813232, "learning_rate": 7.71463406646769e-07, "loss": 3.8509, "step": 2524 }, { "epoch": 1.9960474308300395, "grad_norm": 2.0358211994171143, "learning_rate": 7.703672433008327e-07, "loss": 4.0995, "step": 2525 }, { "epoch": 1.9968379446640316, "grad_norm": 2.242879629135132, "learning_rate": 7.692715901328105e-07, "loss": 4.254, "step": 2526 }, { "epoch": 1.9976284584980237, "grad_norm": 2.781536102294922, "learning_rate": 7.681764479088097e-07, "loss": 3.7774, "step": 2527 }, { "epoch": 1.9984189723320158, "grad_norm": 2.25451397895813, "learning_rate": 7.670818173945827e-07, "loss": 4.0247, "step": 2528 }, { "epoch": 1.999209486166008, "grad_norm": 2.059314727783203, "learning_rate": 7.659876993555231e-07, "loss": 4.0681, "step": 2529 }, { "epoch": 2.0, "grad_norm": 2.049372911453247, "learning_rate": 7.648940945566664e-07, "loss": 4.0676, "step": 2530 }, { "epoch": 2.000790513833992, "grad_norm": 2.5186424255371094, "learning_rate": 7.638010037626888e-07, "loss": 3.9528, "step": 2531 }, { "epoch": 2.001581027667984, "grad_norm": 2.010427951812744, "learning_rate": 7.627084277379075e-07, "loss": 4.241, "step": 2532 }, { "epoch": 2.0023715415019763, "grad_norm": 2.3277835845947266, "learning_rate": 7.616163672462797e-07, "loss": 3.8113, "step": 2533 }, { "epoch": 2.0031620553359684, "grad_norm": 2.2545735836029053, "learning_rate": 7.605248230514021e-07, "loss": 3.7671, "step": 2534 }, { "epoch": 2.0039525691699605, "grad_norm": 2.1920158863067627, "learning_rate": 7.594337959165108e-07, "loss": 3.8335, "step": 2535 }, { "epoch": 2.0047430830039525, "grad_norm": 2.260524272918701, "learning_rate": 7.583432866044789e-07, "loss": 3.6129, "step": 2536 }, { "epoch": 2.0055335968379446, "grad_norm": 2.3055973052978516, "learning_rate": 7.572532958778186e-07, "loss": 4.1654, "step": 2537 }, { "epoch": 2.0063241106719367, "grad_norm": 2.2161951065063477, "learning_rate": 7.561638244986805e-07, "loss": 3.8221, "step": 2538 }, { "epoch": 2.007114624505929, "grad_norm": 1.962371826171875, "learning_rate": 7.550748732288498e-07, "loss": 3.9866, "step": 2539 }, { "epoch": 2.007905138339921, "grad_norm": 2.0217130184173584, "learning_rate": 7.539864428297493e-07, "loss": 4.176, "step": 2540 }, { "epoch": 2.008695652173913, "grad_norm": 2.0840904712677, "learning_rate": 7.528985340624377e-07, "loss": 3.9553, "step": 2541 }, { "epoch": 2.009486166007905, "grad_norm": 2.0949409008026123, "learning_rate": 7.518111476876084e-07, "loss": 3.9234, "step": 2542 }, { "epoch": 2.010276679841897, "grad_norm": 2.107720375061035, "learning_rate": 7.507242844655898e-07, "loss": 3.996, "step": 2543 }, { "epoch": 2.0110671936758893, "grad_norm": 2.0685389041900635, "learning_rate": 7.496379451563454e-07, "loss": 4.0754, "step": 2544 }, { "epoch": 2.0118577075098814, "grad_norm": 2.0757534503936768, "learning_rate": 7.485521305194695e-07, "loss": 3.9104, "step": 2545 }, { "epoch": 2.0126482213438734, "grad_norm": 2.0996570587158203, "learning_rate": 7.474668413141934e-07, "loss": 4.1246, "step": 2546 }, { "epoch": 2.0134387351778655, "grad_norm": 2.2191169261932373, "learning_rate": 7.463820782993789e-07, "loss": 3.8396, "step": 2547 }, { "epoch": 2.0142292490118576, "grad_norm": 1.9808927774429321, "learning_rate": 7.45297842233519e-07, "loss": 4.3631, "step": 2548 }, { "epoch": 2.0150197628458497, "grad_norm": 2.177454710006714, "learning_rate": 7.4421413387474e-07, "loss": 4.0013, "step": 2549 }, { "epoch": 2.015810276679842, "grad_norm": 2.037930965423584, "learning_rate": 7.431309539807985e-07, "loss": 4.228, "step": 2550 }, { "epoch": 2.016600790513834, "grad_norm": 2.1270675659179688, "learning_rate": 7.420483033090815e-07, "loss": 4.1164, "step": 2551 }, { "epoch": 2.017391304347826, "grad_norm": 2.385662078857422, "learning_rate": 7.40966182616607e-07, "loss": 4.1021, "step": 2552 }, { "epoch": 2.018181818181818, "grad_norm": 1.999930739402771, "learning_rate": 7.398845926600193e-07, "loss": 4.1901, "step": 2553 }, { "epoch": 2.01897233201581, "grad_norm": 2.339833974838257, "learning_rate": 7.388035341955959e-07, "loss": 3.9349, "step": 2554 }, { "epoch": 2.0197628458498023, "grad_norm": 2.093494415283203, "learning_rate": 7.377230079792406e-07, "loss": 4.157, "step": 2555 }, { "epoch": 2.0205533596837943, "grad_norm": 2.176302194595337, "learning_rate": 7.366430147664835e-07, "loss": 3.6387, "step": 2556 }, { "epoch": 2.0213438735177864, "grad_norm": 2.1041884422302246, "learning_rate": 7.355635553124839e-07, "loss": 4.0071, "step": 2557 }, { "epoch": 2.0221343873517785, "grad_norm": 2.3848698139190674, "learning_rate": 7.344846303720289e-07, "loss": 3.8594, "step": 2558 }, { "epoch": 2.0229249011857706, "grad_norm": 2.3372397422790527, "learning_rate": 7.33406240699529e-07, "loss": 4.0584, "step": 2559 }, { "epoch": 2.0237154150197627, "grad_norm": 2.3239901065826416, "learning_rate": 7.323283870490224e-07, "loss": 3.8461, "step": 2560 }, { "epoch": 2.024505928853755, "grad_norm": 2.0883102416992188, "learning_rate": 7.312510701741718e-07, "loss": 3.8877, "step": 2561 }, { "epoch": 2.025296442687747, "grad_norm": 2.208603620529175, "learning_rate": 7.301742908282648e-07, "loss": 4.0355, "step": 2562 }, { "epoch": 2.026086956521739, "grad_norm": 2.1712558269500732, "learning_rate": 7.290980497642135e-07, "loss": 4.0715, "step": 2563 }, { "epoch": 2.026877470355731, "grad_norm": 2.2471652030944824, "learning_rate": 7.280223477345531e-07, "loss": 3.761, "step": 2564 }, { "epoch": 2.027667984189723, "grad_norm": 2.008190155029297, "learning_rate": 7.26947185491441e-07, "loss": 4.0246, "step": 2565 }, { "epoch": 2.0284584980237153, "grad_norm": 2.1392323970794678, "learning_rate": 7.258725637866594e-07, "loss": 4.1784, "step": 2566 }, { "epoch": 2.0292490118577073, "grad_norm": 2.337752342224121, "learning_rate": 7.247984833716116e-07, "loss": 3.6967, "step": 2567 }, { "epoch": 2.0300395256916994, "grad_norm": 2.2881908416748047, "learning_rate": 7.237249449973209e-07, "loss": 3.5704, "step": 2568 }, { "epoch": 2.0308300395256915, "grad_norm": 2.313084602355957, "learning_rate": 7.226519494144333e-07, "loss": 4.3467, "step": 2569 }, { "epoch": 2.0316205533596836, "grad_norm": 2.0554583072662354, "learning_rate": 7.21579497373215e-07, "loss": 4.1384, "step": 2570 }, { "epoch": 2.0324110671936757, "grad_norm": 1.9852055311203003, "learning_rate": 7.205075896235517e-07, "loss": 4.0109, "step": 2571 }, { "epoch": 2.033201581027668, "grad_norm": 2.005239963531494, "learning_rate": 7.19436226914949e-07, "loss": 4.3993, "step": 2572 }, { "epoch": 2.03399209486166, "grad_norm": 2.0758626461029053, "learning_rate": 7.18365409996531e-07, "loss": 4.035, "step": 2573 }, { "epoch": 2.034782608695652, "grad_norm": 2.139106512069702, "learning_rate": 7.172951396170404e-07, "loss": 4.0516, "step": 2574 }, { "epoch": 2.035573122529644, "grad_norm": 2.0396728515625, "learning_rate": 7.162254165248375e-07, "loss": 4.1433, "step": 2575 }, { "epoch": 2.036363636363636, "grad_norm": 2.2337582111358643, "learning_rate": 7.151562414679009e-07, "loss": 3.8205, "step": 2576 }, { "epoch": 2.0371541501976282, "grad_norm": 2.2419965267181396, "learning_rate": 7.140876151938237e-07, "loss": 3.8195, "step": 2577 }, { "epoch": 2.0379446640316203, "grad_norm": 2.1409718990325928, "learning_rate": 7.130195384498173e-07, "loss": 4.0379, "step": 2578 }, { "epoch": 2.038735177865613, "grad_norm": 2.1437008380889893, "learning_rate": 7.119520119827095e-07, "loss": 4.2017, "step": 2579 }, { "epoch": 2.039525691699605, "grad_norm": 2.083169937133789, "learning_rate": 7.108850365389408e-07, "loss": 4.0308, "step": 2580 }, { "epoch": 2.040316205533597, "grad_norm": 2.3034915924072266, "learning_rate": 7.09818612864568e-07, "loss": 3.9416, "step": 2581 }, { "epoch": 2.041106719367589, "grad_norm": 2.3081564903259277, "learning_rate": 7.087527417052623e-07, "loss": 3.6804, "step": 2582 }, { "epoch": 2.0418972332015812, "grad_norm": 2.4503047466278076, "learning_rate": 7.076874238063077e-07, "loss": 3.9307, "step": 2583 }, { "epoch": 2.0426877470355733, "grad_norm": 2.2071549892425537, "learning_rate": 7.066226599126025e-07, "loss": 4.0839, "step": 2584 }, { "epoch": 2.0434782608695654, "grad_norm": 2.0486133098602295, "learning_rate": 7.055584507686553e-07, "loss": 4.2506, "step": 2585 }, { "epoch": 2.0442687747035575, "grad_norm": 2.0520131587982178, "learning_rate": 7.044947971185899e-07, "loss": 4.1004, "step": 2586 }, { "epoch": 2.0450592885375496, "grad_norm": 2.0991270542144775, "learning_rate": 7.0343169970614e-07, "loss": 3.9517, "step": 2587 }, { "epoch": 2.0458498023715417, "grad_norm": 2.0886788368225098, "learning_rate": 7.023691592746499e-07, "loss": 3.9749, "step": 2588 }, { "epoch": 2.0466403162055338, "grad_norm": 2.052706718444824, "learning_rate": 7.013071765670755e-07, "loss": 4.3397, "step": 2589 }, { "epoch": 2.047430830039526, "grad_norm": 4.597567081451416, "learning_rate": 7.002457523259821e-07, "loss": 3.7609, "step": 2590 }, { "epoch": 2.048221343873518, "grad_norm": 2.317901849746704, "learning_rate": 6.991848872935448e-07, "loss": 3.7314, "step": 2591 }, { "epoch": 2.04901185770751, "grad_norm": 2.138231039047241, "learning_rate": 6.981245822115477e-07, "loss": 4.1922, "step": 2592 }, { "epoch": 2.049802371541502, "grad_norm": 2.295841693878174, "learning_rate": 6.970648378213833e-07, "loss": 3.6068, "step": 2593 }, { "epoch": 2.0505928853754942, "grad_norm": 2.277991533279419, "learning_rate": 6.960056548640518e-07, "loss": 3.9025, "step": 2594 }, { "epoch": 2.0513833992094863, "grad_norm": 2.1784396171569824, "learning_rate": 6.949470340801613e-07, "loss": 3.7666, "step": 2595 }, { "epoch": 2.0521739130434784, "grad_norm": 2.0775437355041504, "learning_rate": 6.938889762099271e-07, "loss": 3.9904, "step": 2596 }, { "epoch": 2.0529644268774705, "grad_norm": 1.9902757406234741, "learning_rate": 6.92831481993169e-07, "loss": 4.1285, "step": 2597 }, { "epoch": 2.0537549407114626, "grad_norm": 2.097541332244873, "learning_rate": 6.917745521693144e-07, "loss": 3.8846, "step": 2598 }, { "epoch": 2.0545454545454547, "grad_norm": 2.2123894691467285, "learning_rate": 6.907181874773972e-07, "loss": 4.0104, "step": 2599 }, { "epoch": 2.0553359683794468, "grad_norm": 2.051628589630127, "learning_rate": 6.896623886560528e-07, "loss": 4.3143, "step": 2600 }, { "epoch": 2.056126482213439, "grad_norm": 2.1457600593566895, "learning_rate": 6.886071564435236e-07, "loss": 3.7428, "step": 2601 }, { "epoch": 2.056916996047431, "grad_norm": 4.291740894317627, "learning_rate": 6.875524915776549e-07, "loss": 3.9391, "step": 2602 }, { "epoch": 2.057707509881423, "grad_norm": 2.313361406326294, "learning_rate": 6.864983947958956e-07, "loss": 3.9036, "step": 2603 }, { "epoch": 2.058498023715415, "grad_norm": 2.1482081413269043, "learning_rate": 6.854448668352968e-07, "loss": 4.0205, "step": 2604 }, { "epoch": 2.059288537549407, "grad_norm": 2.1445353031158447, "learning_rate": 6.843919084325131e-07, "loss": 3.9359, "step": 2605 }, { "epoch": 2.0600790513833993, "grad_norm": 1.936003565788269, "learning_rate": 6.833395203237979e-07, "loss": 4.2037, "step": 2606 }, { "epoch": 2.0608695652173914, "grad_norm": 1.807455062866211, "learning_rate": 6.822877032450101e-07, "loss": 4.3576, "step": 2607 }, { "epoch": 2.0616600790513835, "grad_norm": 2.1759517192840576, "learning_rate": 6.812364579316069e-07, "loss": 3.8551, "step": 2608 }, { "epoch": 2.0624505928853756, "grad_norm": 2.350123167037964, "learning_rate": 6.801857851186449e-07, "loss": 4.1217, "step": 2609 }, { "epoch": 2.0632411067193677, "grad_norm": 2.1032676696777344, "learning_rate": 6.791356855407819e-07, "loss": 4.0238, "step": 2610 }, { "epoch": 2.0640316205533598, "grad_norm": 1.949480652809143, "learning_rate": 6.780861599322744e-07, "loss": 4.0451, "step": 2611 }, { "epoch": 2.064822134387352, "grad_norm": 2.224256992340088, "learning_rate": 6.770372090269778e-07, "loss": 4.0767, "step": 2612 }, { "epoch": 2.065612648221344, "grad_norm": 2.6317191123962402, "learning_rate": 6.759888335583458e-07, "loss": 3.8721, "step": 2613 }, { "epoch": 2.066403162055336, "grad_norm": 2.5325849056243896, "learning_rate": 6.749410342594278e-07, "loss": 3.3738, "step": 2614 }, { "epoch": 2.067193675889328, "grad_norm": 2.363382577896118, "learning_rate": 6.738938118628736e-07, "loss": 3.8319, "step": 2615 }, { "epoch": 2.06798418972332, "grad_norm": 2.1365323066711426, "learning_rate": 6.72847167100928e-07, "loss": 3.7785, "step": 2616 }, { "epoch": 2.0687747035573123, "grad_norm": 2.4641144275665283, "learning_rate": 6.718011007054305e-07, "loss": 4.0267, "step": 2617 }, { "epoch": 2.0695652173913044, "grad_norm": 2.4287562370300293, "learning_rate": 6.707556134078177e-07, "loss": 3.8412, "step": 2618 }, { "epoch": 2.0703557312252965, "grad_norm": 1.935541033744812, "learning_rate": 6.697107059391226e-07, "loss": 4.2741, "step": 2619 }, { "epoch": 2.0711462450592886, "grad_norm": 2.33475661277771, "learning_rate": 6.686663790299697e-07, "loss": 3.8857, "step": 2620 }, { "epoch": 2.0719367588932807, "grad_norm": 2.2207083702087402, "learning_rate": 6.676226334105796e-07, "loss": 4.1251, "step": 2621 }, { "epoch": 2.0727272727272728, "grad_norm": 2.0099565982818604, "learning_rate": 6.665794698107662e-07, "loss": 4.0004, "step": 2622 }, { "epoch": 2.073517786561265, "grad_norm": 2.3481411933898926, "learning_rate": 6.65536888959936e-07, "loss": 3.9541, "step": 2623 }, { "epoch": 2.074308300395257, "grad_norm": 2.7546629905700684, "learning_rate": 6.644948915870884e-07, "loss": 3.9916, "step": 2624 }, { "epoch": 2.075098814229249, "grad_norm": 2.271188974380493, "learning_rate": 6.63453478420815e-07, "loss": 3.8678, "step": 2625 }, { "epoch": 2.075889328063241, "grad_norm": 2.385753631591797, "learning_rate": 6.624126501892971e-07, "loss": 3.8596, "step": 2626 }, { "epoch": 2.076679841897233, "grad_norm": 2.0323269367218018, "learning_rate": 6.613724076203099e-07, "loss": 4.3079, "step": 2627 }, { "epoch": 2.0774703557312253, "grad_norm": 2.1008145809173584, "learning_rate": 6.603327514412178e-07, "loss": 3.6516, "step": 2628 }, { "epoch": 2.0782608695652174, "grad_norm": 2.018075704574585, "learning_rate": 6.59293682378974e-07, "loss": 4.1657, "step": 2629 }, { "epoch": 2.0790513833992095, "grad_norm": 2.1176459789276123, "learning_rate": 6.582552011601226e-07, "loss": 4.2393, "step": 2630 }, { "epoch": 2.0798418972332016, "grad_norm": 2.207967758178711, "learning_rate": 6.572173085107964e-07, "loss": 4.1351, "step": 2631 }, { "epoch": 2.0806324110671937, "grad_norm": 2.3258626461029053, "learning_rate": 6.561800051567163e-07, "loss": 3.9314, "step": 2632 }, { "epoch": 2.0814229249011857, "grad_norm": 2.3027796745300293, "learning_rate": 6.551432918231916e-07, "loss": 4.0459, "step": 2633 }, { "epoch": 2.082213438735178, "grad_norm": 2.1920151710510254, "learning_rate": 6.541071692351188e-07, "loss": 4.2841, "step": 2634 }, { "epoch": 2.08300395256917, "grad_norm": 2.0923256874084473, "learning_rate": 6.530716381169813e-07, "loss": 4.1155, "step": 2635 }, { "epoch": 2.083794466403162, "grad_norm": 2.0322725772857666, "learning_rate": 6.52036699192849e-07, "loss": 4.1235, "step": 2636 }, { "epoch": 2.084584980237154, "grad_norm": 2.314418315887451, "learning_rate": 6.510023531863782e-07, "loss": 3.941, "step": 2637 }, { "epoch": 2.085375494071146, "grad_norm": 2.0449397563934326, "learning_rate": 6.499686008208092e-07, "loss": 4.1015, "step": 2638 }, { "epoch": 2.0861660079051383, "grad_norm": 2.5483036041259766, "learning_rate": 6.489354428189684e-07, "loss": 3.5965, "step": 2639 }, { "epoch": 2.0869565217391304, "grad_norm": 2.0112524032592773, "learning_rate": 6.479028799032664e-07, "loss": 4.1641, "step": 2640 }, { "epoch": 2.0877470355731225, "grad_norm": 2.3373377323150635, "learning_rate": 6.46870912795698e-07, "loss": 3.662, "step": 2641 }, { "epoch": 2.0885375494071146, "grad_norm": 2.105665683746338, "learning_rate": 6.458395422178406e-07, "loss": 4.1848, "step": 2642 }, { "epoch": 2.0893280632411066, "grad_norm": 2.2729222774505615, "learning_rate": 6.448087688908552e-07, "loss": 3.7477, "step": 2643 }, { "epoch": 2.0901185770750987, "grad_norm": 2.3161981105804443, "learning_rate": 6.437785935354847e-07, "loss": 4.1025, "step": 2644 }, { "epoch": 2.090909090909091, "grad_norm": 2.125169038772583, "learning_rate": 6.42749016872055e-07, "loss": 4.0388, "step": 2645 }, { "epoch": 2.091699604743083, "grad_norm": 2.3133440017700195, "learning_rate": 6.417200396204708e-07, "loss": 3.925, "step": 2646 }, { "epoch": 2.092490118577075, "grad_norm": 2.1882100105285645, "learning_rate": 6.4069166250022e-07, "loss": 3.8, "step": 2647 }, { "epoch": 2.093280632411067, "grad_norm": 2.696091413497925, "learning_rate": 6.396638862303712e-07, "loss": 3.8034, "step": 2648 }, { "epoch": 2.094071146245059, "grad_norm": 2.1593940258026123, "learning_rate": 6.386367115295709e-07, "loss": 3.9644, "step": 2649 }, { "epoch": 2.0948616600790513, "grad_norm": 2.1106767654418945, "learning_rate": 6.376101391160461e-07, "loss": 4.1268, "step": 2650 }, { "epoch": 2.0956521739130434, "grad_norm": 2.225825548171997, "learning_rate": 6.365841697076028e-07, "loss": 4.145, "step": 2651 }, { "epoch": 2.0964426877470355, "grad_norm": 2.163008689880371, "learning_rate": 6.355588040216248e-07, "loss": 4.015, "step": 2652 }, { "epoch": 2.0972332015810276, "grad_norm": 2.290705680847168, "learning_rate": 6.345340427750743e-07, "loss": 4.0565, "step": 2653 }, { "epoch": 2.0980237154150196, "grad_norm": 1.9997042417526245, "learning_rate": 6.335098866844904e-07, "loss": 4.1829, "step": 2654 }, { "epoch": 2.0988142292490117, "grad_norm": 2.279648780822754, "learning_rate": 6.324863364659894e-07, "loss": 3.7304, "step": 2655 }, { "epoch": 2.099604743083004, "grad_norm": 2.118682622909546, "learning_rate": 6.314633928352638e-07, "loss": 4.1929, "step": 2656 }, { "epoch": 2.100395256916996, "grad_norm": 5.272444725036621, "learning_rate": 6.304410565075826e-07, "loss": 4.1288, "step": 2657 }, { "epoch": 2.101185770750988, "grad_norm": 2.296022653579712, "learning_rate": 6.294193281977884e-07, "loss": 3.6022, "step": 2658 }, { "epoch": 2.10197628458498, "grad_norm": 2.0946223735809326, "learning_rate": 6.283982086202996e-07, "loss": 3.8445, "step": 2659 }, { "epoch": 2.102766798418972, "grad_norm": 2.2878336906433105, "learning_rate": 6.273776984891114e-07, "loss": 3.959, "step": 2660 }, { "epoch": 2.1035573122529643, "grad_norm": 2.2496211528778076, "learning_rate": 6.263577985177883e-07, "loss": 4.2284, "step": 2661 }, { "epoch": 2.1043478260869564, "grad_norm": 2.060621500015259, "learning_rate": 6.253385094194715e-07, "loss": 4.2485, "step": 2662 }, { "epoch": 2.1051383399209485, "grad_norm": 2.2456696033477783, "learning_rate": 6.243198319068738e-07, "loss": 3.9329, "step": 2663 }, { "epoch": 2.1059288537549405, "grad_norm": 2.1840338706970215, "learning_rate": 6.233017666922806e-07, "loss": 3.7256, "step": 2664 }, { "epoch": 2.1067193675889326, "grad_norm": 2.0171422958374023, "learning_rate": 6.222843144875493e-07, "loss": 4.215, "step": 2665 }, { "epoch": 2.1075098814229247, "grad_norm": 2.0179247856140137, "learning_rate": 6.212674760041092e-07, "loss": 4.2713, "step": 2666 }, { "epoch": 2.108300395256917, "grad_norm": 2.2149100303649902, "learning_rate": 6.202512519529579e-07, "loss": 3.6865, "step": 2667 }, { "epoch": 2.109090909090909, "grad_norm": 2.2903504371643066, "learning_rate": 6.192356430446672e-07, "loss": 3.7825, "step": 2668 }, { "epoch": 2.109881422924901, "grad_norm": 2.1778054237365723, "learning_rate": 6.182206499893765e-07, "loss": 3.5899, "step": 2669 }, { "epoch": 2.110671936758893, "grad_norm": 2.0226094722747803, "learning_rate": 6.172062734967942e-07, "loss": 4.1278, "step": 2670 }, { "epoch": 2.111462450592885, "grad_norm": 2.3908228874206543, "learning_rate": 6.161925142761988e-07, "loss": 3.7641, "step": 2671 }, { "epoch": 2.1122529644268773, "grad_norm": 1.9532095193862915, "learning_rate": 6.151793730364365e-07, "loss": 4.2634, "step": 2672 }, { "epoch": 2.1130434782608694, "grad_norm": 2.2630867958068848, "learning_rate": 6.14166850485922e-07, "loss": 3.8913, "step": 2673 }, { "epoch": 2.1138339920948614, "grad_norm": 2.0239953994750977, "learning_rate": 6.131549473326372e-07, "loss": 3.9001, "step": 2674 }, { "epoch": 2.1146245059288535, "grad_norm": 2.224271774291992, "learning_rate": 6.121436642841291e-07, "loss": 3.8104, "step": 2675 }, { "epoch": 2.115415019762846, "grad_norm": 2.3566489219665527, "learning_rate": 6.111330020475144e-07, "loss": 3.6665, "step": 2676 }, { "epoch": 2.1162055335968377, "grad_norm": 1.8807474374771118, "learning_rate": 6.101229613294742e-07, "loss": 4.4066, "step": 2677 }, { "epoch": 2.1169960474308303, "grad_norm": 2.692246198654175, "learning_rate": 6.091135428362536e-07, "loss": 3.9122, "step": 2678 }, { "epoch": 2.1177865612648223, "grad_norm": 2.108886241912842, "learning_rate": 6.081047472736639e-07, "loss": 3.7875, "step": 2679 }, { "epoch": 2.1185770750988144, "grad_norm": 2.503558874130249, "learning_rate": 6.070965753470824e-07, "loss": 3.5141, "step": 2680 }, { "epoch": 2.1193675889328065, "grad_norm": 2.4430716037750244, "learning_rate": 6.06089027761447e-07, "loss": 4.1648, "step": 2681 }, { "epoch": 2.1201581027667986, "grad_norm": 2.0723657608032227, "learning_rate": 6.050821052212616e-07, "loss": 4.1743, "step": 2682 }, { "epoch": 2.1209486166007907, "grad_norm": 2.138986110687256, "learning_rate": 6.040758084305919e-07, "loss": 3.9358, "step": 2683 }, { "epoch": 2.121739130434783, "grad_norm": 2.1554646492004395, "learning_rate": 6.030701380930667e-07, "loss": 3.9073, "step": 2684 }, { "epoch": 2.122529644268775, "grad_norm": 2.0839650630950928, "learning_rate": 6.020650949118763e-07, "loss": 4.0808, "step": 2685 }, { "epoch": 2.123320158102767, "grad_norm": 2.0418930053710938, "learning_rate": 6.010606795897731e-07, "loss": 4.0651, "step": 2686 }, { "epoch": 2.124110671936759, "grad_norm": 1.991193175315857, "learning_rate": 6.000568928290687e-07, "loss": 4.2772, "step": 2687 }, { "epoch": 2.124901185770751, "grad_norm": 2.170978307723999, "learning_rate": 5.990537353316379e-07, "loss": 4.0613, "step": 2688 }, { "epoch": 2.1256916996047432, "grad_norm": 1.935886263847351, "learning_rate": 5.980512077989139e-07, "loss": 4.2512, "step": 2689 }, { "epoch": 2.1264822134387353, "grad_norm": 2.014779806137085, "learning_rate": 5.970493109318891e-07, "loss": 3.9929, "step": 2690 }, { "epoch": 2.1272727272727274, "grad_norm": 2.0326521396636963, "learning_rate": 5.960480454311156e-07, "loss": 4.1984, "step": 2691 }, { "epoch": 2.1280632411067195, "grad_norm": 2.1586740016937256, "learning_rate": 5.950474119967039e-07, "loss": 3.9606, "step": 2692 }, { "epoch": 2.1288537549407116, "grad_norm": 2.116882801055908, "learning_rate": 5.940474113283229e-07, "loss": 3.9308, "step": 2693 }, { "epoch": 2.1296442687747037, "grad_norm": 2.3141119480133057, "learning_rate": 5.930480441251982e-07, "loss": 3.7885, "step": 2694 }, { "epoch": 2.130434782608696, "grad_norm": 2.5389723777770996, "learning_rate": 5.920493110861131e-07, "loss": 3.5363, "step": 2695 }, { "epoch": 2.131225296442688, "grad_norm": 2.1043100357055664, "learning_rate": 5.910512129094075e-07, "loss": 3.9704, "step": 2696 }, { "epoch": 2.13201581027668, "grad_norm": 2.150761842727661, "learning_rate": 5.90053750292977e-07, "loss": 4.1531, "step": 2697 }, { "epoch": 2.132806324110672, "grad_norm": 2.012230634689331, "learning_rate": 5.890569239342737e-07, "loss": 4.2193, "step": 2698 }, { "epoch": 2.133596837944664, "grad_norm": 1.9508051872253418, "learning_rate": 5.880607345303029e-07, "loss": 4.3082, "step": 2699 }, { "epoch": 2.1343873517786562, "grad_norm": 2.2286088466644287, "learning_rate": 5.870651827776264e-07, "loss": 3.8756, "step": 2700 }, { "epoch": 2.1351778656126483, "grad_norm": 2.371452808380127, "learning_rate": 5.860702693723594e-07, "loss": 3.8349, "step": 2701 }, { "epoch": 2.1359683794466404, "grad_norm": 2.155764102935791, "learning_rate": 5.850759950101711e-07, "loss": 3.8394, "step": 2702 }, { "epoch": 2.1367588932806325, "grad_norm": 2.013322353363037, "learning_rate": 5.840823603862832e-07, "loss": 4.4297, "step": 2703 }, { "epoch": 2.1375494071146246, "grad_norm": 2.235630989074707, "learning_rate": 5.830893661954708e-07, "loss": 3.7892, "step": 2704 }, { "epoch": 2.1383399209486167, "grad_norm": 2.112055540084839, "learning_rate": 5.820970131320607e-07, "loss": 4.0755, "step": 2705 }, { "epoch": 2.139130434782609, "grad_norm": 2.403226613998413, "learning_rate": 5.811053018899322e-07, "loss": 3.9177, "step": 2706 }, { "epoch": 2.139920948616601, "grad_norm": 2.0077171325683594, "learning_rate": 5.801142331625142e-07, "loss": 4.0885, "step": 2707 }, { "epoch": 2.140711462450593, "grad_norm": 2.1677193641662598, "learning_rate": 5.79123807642787e-07, "loss": 3.8611, "step": 2708 }, { "epoch": 2.141501976284585, "grad_norm": 2.206362247467041, "learning_rate": 5.781340260232836e-07, "loss": 3.8635, "step": 2709 }, { "epoch": 2.142292490118577, "grad_norm": 1.9233723878860474, "learning_rate": 5.771448889960826e-07, "loss": 4.2394, "step": 2710 }, { "epoch": 2.1430830039525692, "grad_norm": 2.3603668212890625, "learning_rate": 5.761563972528149e-07, "loss": 3.6375, "step": 2711 }, { "epoch": 2.1438735177865613, "grad_norm": 2.284712791442871, "learning_rate": 5.751685514846588e-07, "loss": 3.6737, "step": 2712 }, { "epoch": 2.1446640316205534, "grad_norm": 2.1165506839752197, "learning_rate": 5.741813523823417e-07, "loss": 4.0545, "step": 2713 }, { "epoch": 2.1454545454545455, "grad_norm": 2.1016337871551514, "learning_rate": 5.731948006361384e-07, "loss": 4.0206, "step": 2714 }, { "epoch": 2.1462450592885376, "grad_norm": 2.006587266921997, "learning_rate": 5.722088969358716e-07, "loss": 4.3306, "step": 2715 }, { "epoch": 2.1470355731225297, "grad_norm": 2.87530517578125, "learning_rate": 5.71223641970909e-07, "loss": 3.8678, "step": 2716 }, { "epoch": 2.1478260869565218, "grad_norm": 2.09334397315979, "learning_rate": 5.702390364301677e-07, "loss": 3.9614, "step": 2717 }, { "epoch": 2.148616600790514, "grad_norm": 2.0853214263916016, "learning_rate": 5.69255081002109e-07, "loss": 4.0752, "step": 2718 }, { "epoch": 2.149407114624506, "grad_norm": 2.23056697845459, "learning_rate": 5.682717763747386e-07, "loss": 3.6756, "step": 2719 }, { "epoch": 2.150197628458498, "grad_norm": 2.141388177871704, "learning_rate": 5.672891232356084e-07, "loss": 3.9781, "step": 2720 }, { "epoch": 2.15098814229249, "grad_norm": 2.1534762382507324, "learning_rate": 5.663071222718162e-07, "loss": 4.0761, "step": 2721 }, { "epoch": 2.1517786561264822, "grad_norm": 2.1363346576690674, "learning_rate": 5.653257741700008e-07, "loss": 4.1296, "step": 2722 }, { "epoch": 2.1525691699604743, "grad_norm": 2.219752073287964, "learning_rate": 5.643450796163463e-07, "loss": 4.0392, "step": 2723 }, { "epoch": 2.1533596837944664, "grad_norm": 1.9064180850982666, "learning_rate": 5.633650392965794e-07, "loss": 4.1133, "step": 2724 }, { "epoch": 2.1541501976284585, "grad_norm": 2.1978135108947754, "learning_rate": 5.623856538959695e-07, "loss": 3.8134, "step": 2725 }, { "epoch": 2.1549407114624506, "grad_norm": 2.194403648376465, "learning_rate": 5.614069240993279e-07, "loss": 3.9821, "step": 2726 }, { "epoch": 2.1557312252964427, "grad_norm": 2.0340182781219482, "learning_rate": 5.604288505910081e-07, "loss": 4.036, "step": 2727 }, { "epoch": 2.1565217391304348, "grad_norm": 1.9155381917953491, "learning_rate": 5.594514340549027e-07, "loss": 4.4123, "step": 2728 }, { "epoch": 2.157312252964427, "grad_norm": 2.233614683151245, "learning_rate": 5.584746751744477e-07, "loss": 3.8464, "step": 2729 }, { "epoch": 2.158102766798419, "grad_norm": 2.409108877182007, "learning_rate": 5.574985746326181e-07, "loss": 3.8896, "step": 2730 }, { "epoch": 2.158893280632411, "grad_norm": 2.345953941345215, "learning_rate": 5.565231331119274e-07, "loss": 3.7563, "step": 2731 }, { "epoch": 2.159683794466403, "grad_norm": 2.085721015930176, "learning_rate": 5.555483512944296e-07, "loss": 4.1672, "step": 2732 }, { "epoch": 2.160474308300395, "grad_norm": 1.9490658044815063, "learning_rate": 5.545742298617173e-07, "loss": 4.2358, "step": 2733 }, { "epoch": 2.1612648221343873, "grad_norm": 2.081501007080078, "learning_rate": 5.536007694949212e-07, "loss": 4.017, "step": 2734 }, { "epoch": 2.1620553359683794, "grad_norm": 2.2222087383270264, "learning_rate": 5.5262797087471e-07, "loss": 3.9352, "step": 2735 }, { "epoch": 2.1628458498023715, "grad_norm": 2.064795970916748, "learning_rate": 5.516558346812881e-07, "loss": 4.1594, "step": 2736 }, { "epoch": 2.1636363636363636, "grad_norm": 2.2836456298828125, "learning_rate": 5.506843615943995e-07, "loss": 4.0158, "step": 2737 }, { "epoch": 2.1644268774703557, "grad_norm": 2.0136311054229736, "learning_rate": 5.49713552293323e-07, "loss": 3.9613, "step": 2738 }, { "epoch": 2.1652173913043478, "grad_norm": 2.2173690795898438, "learning_rate": 5.487434074568725e-07, "loss": 3.7098, "step": 2739 }, { "epoch": 2.16600790513834, "grad_norm": 2.0859196186065674, "learning_rate": 5.477739277633978e-07, "loss": 4.102, "step": 2740 }, { "epoch": 2.166798418972332, "grad_norm": 2.2844343185424805, "learning_rate": 5.468051138907857e-07, "loss": 4.0288, "step": 2741 }, { "epoch": 2.167588932806324, "grad_norm": 2.684349775314331, "learning_rate": 5.458369665164543e-07, "loss": 4.2053, "step": 2742 }, { "epoch": 2.168379446640316, "grad_norm": 2.1907100677490234, "learning_rate": 5.448694863173571e-07, "loss": 4.0794, "step": 2743 }, { "epoch": 2.169169960474308, "grad_norm": 2.011517286300659, "learning_rate": 5.439026739699815e-07, "loss": 4.1371, "step": 2744 }, { "epoch": 2.1699604743083003, "grad_norm": 2.132842779159546, "learning_rate": 5.429365301503472e-07, "loss": 3.915, "step": 2745 }, { "epoch": 2.1707509881422924, "grad_norm": 2.119403839111328, "learning_rate": 5.419710555340066e-07, "loss": 4.1658, "step": 2746 }, { "epoch": 2.1715415019762845, "grad_norm": 2.3849949836730957, "learning_rate": 5.410062507960452e-07, "loss": 3.4702, "step": 2747 }, { "epoch": 2.1723320158102766, "grad_norm": 2.3973073959350586, "learning_rate": 5.400421166110772e-07, "loss": 4.1224, "step": 2748 }, { "epoch": 2.1731225296442687, "grad_norm": 2.067288637161255, "learning_rate": 5.390786536532517e-07, "loss": 3.8625, "step": 2749 }, { "epoch": 2.1739130434782608, "grad_norm": 2.077786922454834, "learning_rate": 5.381158625962467e-07, "loss": 4.1235, "step": 2750 }, { "epoch": 2.174703557312253, "grad_norm": 2.039048433303833, "learning_rate": 5.371537441132692e-07, "loss": 4.1707, "step": 2751 }, { "epoch": 2.175494071146245, "grad_norm": 2.246257781982422, "learning_rate": 5.361922988770577e-07, "loss": 3.7969, "step": 2752 }, { "epoch": 2.176284584980237, "grad_norm": 2.0647668838500977, "learning_rate": 5.352315275598793e-07, "loss": 4.0009, "step": 2753 }, { "epoch": 2.177075098814229, "grad_norm": 2.1238162517547607, "learning_rate": 5.342714308335301e-07, "loss": 4.0256, "step": 2754 }, { "epoch": 2.177865612648221, "grad_norm": 2.08393931388855, "learning_rate": 5.333120093693341e-07, "loss": 4.1663, "step": 2755 }, { "epoch": 2.1786561264822133, "grad_norm": 2.06774640083313, "learning_rate": 5.323532638381433e-07, "loss": 4.032, "step": 2756 }, { "epoch": 2.1794466403162054, "grad_norm": 2.266786575317383, "learning_rate": 5.313951949103373e-07, "loss": 3.9147, "step": 2757 }, { "epoch": 2.1802371541501975, "grad_norm": 1.9849997758865356, "learning_rate": 5.304378032558224e-07, "loss": 4.0763, "step": 2758 }, { "epoch": 2.1810276679841896, "grad_norm": 2.555131435394287, "learning_rate": 5.294810895440319e-07, "loss": 3.6995, "step": 2759 }, { "epoch": 2.1818181818181817, "grad_norm": 2.2556586265563965, "learning_rate": 5.285250544439234e-07, "loss": 3.9816, "step": 2760 }, { "epoch": 2.1826086956521737, "grad_norm": 3.007000207901001, "learning_rate": 5.275696986239819e-07, "loss": 3.8117, "step": 2761 }, { "epoch": 2.183399209486166, "grad_norm": 2.295405387878418, "learning_rate": 5.266150227522163e-07, "loss": 3.875, "step": 2762 }, { "epoch": 2.184189723320158, "grad_norm": 1.9985212087631226, "learning_rate": 5.256610274961607e-07, "loss": 4.1833, "step": 2763 }, { "epoch": 2.18498023715415, "grad_norm": 2.2089669704437256, "learning_rate": 5.247077135228731e-07, "loss": 4.0765, "step": 2764 }, { "epoch": 2.185770750988142, "grad_norm": 2.1012492179870605, "learning_rate": 5.237550814989348e-07, "loss": 4.1318, "step": 2765 }, { "epoch": 2.186561264822134, "grad_norm": 2.152832508087158, "learning_rate": 5.228031320904505e-07, "loss": 3.9711, "step": 2766 }, { "epoch": 2.1873517786561263, "grad_norm": 2.5962209701538086, "learning_rate": 5.218518659630482e-07, "loss": 3.9697, "step": 2767 }, { "epoch": 2.1881422924901184, "grad_norm": 2.09517240524292, "learning_rate": 5.209012837818766e-07, "loss": 3.9641, "step": 2768 }, { "epoch": 2.1889328063241105, "grad_norm": 2.091139554977417, "learning_rate": 5.199513862116071e-07, "loss": 3.7524, "step": 2769 }, { "epoch": 2.1897233201581026, "grad_norm": 2.3322904109954834, "learning_rate": 5.190021739164338e-07, "loss": 3.6475, "step": 2770 }, { "epoch": 2.190513833992095, "grad_norm": 2.066211462020874, "learning_rate": 5.180536475600689e-07, "loss": 4.2026, "step": 2771 }, { "epoch": 2.1913043478260867, "grad_norm": 2.0893547534942627, "learning_rate": 5.171058078057466e-07, "loss": 3.9548, "step": 2772 }, { "epoch": 2.1920948616600793, "grad_norm": 2.260164499282837, "learning_rate": 5.161586553162212e-07, "loss": 4.0597, "step": 2773 }, { "epoch": 2.192885375494071, "grad_norm": 2.2307674884796143, "learning_rate": 5.152121907537653e-07, "loss": 3.886, "step": 2774 }, { "epoch": 2.1936758893280635, "grad_norm": 2.121661424636841, "learning_rate": 5.142664147801718e-07, "loss": 4.1726, "step": 2775 }, { "epoch": 2.194466403162055, "grad_norm": 2.0772173404693604, "learning_rate": 5.133213280567516e-07, "loss": 4.0767, "step": 2776 }, { "epoch": 2.1952569169960476, "grad_norm": 2.07958722114563, "learning_rate": 5.123769312443323e-07, "loss": 4.0442, "step": 2777 }, { "epoch": 2.1960474308300397, "grad_norm": 1.9662023782730103, "learning_rate": 5.114332250032616e-07, "loss": 3.9205, "step": 2778 }, { "epoch": 2.196837944664032, "grad_norm": 2.0575475692749023, "learning_rate": 5.104902099934032e-07, "loss": 4.0861, "step": 2779 }, { "epoch": 2.197628458498024, "grad_norm": 2.10203218460083, "learning_rate": 5.095478868741364e-07, "loss": 3.9865, "step": 2780 }, { "epoch": 2.198418972332016, "grad_norm": 2.5733797550201416, "learning_rate": 5.086062563043575e-07, "loss": 3.898, "step": 2781 }, { "epoch": 2.199209486166008, "grad_norm": 2.12375807762146, "learning_rate": 5.076653189424804e-07, "loss": 4.1125, "step": 2782 }, { "epoch": 2.2, "grad_norm": 2.077418565750122, "learning_rate": 5.067250754464311e-07, "loss": 4.1011, "step": 2783 }, { "epoch": 2.2007905138339923, "grad_norm": 2.0687601566314697, "learning_rate": 5.057855264736521e-07, "loss": 4.0437, "step": 2784 }, { "epoch": 2.2015810276679844, "grad_norm": 2.195610284805298, "learning_rate": 5.048466726811006e-07, "loss": 4.0339, "step": 2785 }, { "epoch": 2.2023715415019764, "grad_norm": 2.1575779914855957, "learning_rate": 5.03908514725247e-07, "loss": 3.9031, "step": 2786 }, { "epoch": 2.2031620553359685, "grad_norm": 2.101835250854492, "learning_rate": 5.029710532620754e-07, "loss": 4.1131, "step": 2787 }, { "epoch": 2.2039525691699606, "grad_norm": 2.080902099609375, "learning_rate": 5.020342889470832e-07, "loss": 3.8844, "step": 2788 }, { "epoch": 2.2047430830039527, "grad_norm": 2.187547206878662, "learning_rate": 5.010982224352787e-07, "loss": 4.0613, "step": 2789 }, { "epoch": 2.205533596837945, "grad_norm": 2.0726685523986816, "learning_rate": 5.001628543811855e-07, "loss": 4.0016, "step": 2790 }, { "epoch": 2.206324110671937, "grad_norm": 2.4665539264678955, "learning_rate": 4.992281854388356e-07, "loss": 3.7911, "step": 2791 }, { "epoch": 2.207114624505929, "grad_norm": 2.13879132270813, "learning_rate": 4.982942162617734e-07, "loss": 4.1174, "step": 2792 }, { "epoch": 2.207905138339921, "grad_norm": 2.0213816165924072, "learning_rate": 4.973609475030548e-07, "loss": 4.1675, "step": 2793 }, { "epoch": 2.208695652173913, "grad_norm": 2.0910327434539795, "learning_rate": 4.964283798152448e-07, "loss": 4.0494, "step": 2794 }, { "epoch": 2.2094861660079053, "grad_norm": 2.4352362155914307, "learning_rate": 4.954965138504188e-07, "loss": 3.8835, "step": 2795 }, { "epoch": 2.2102766798418974, "grad_norm": 2.069521903991699, "learning_rate": 4.945653502601615e-07, "loss": 4.1467, "step": 2796 }, { "epoch": 2.2110671936758894, "grad_norm": 2.2626218795776367, "learning_rate": 4.936348896955652e-07, "loss": 3.5416, "step": 2797 }, { "epoch": 2.2118577075098815, "grad_norm": 2.2927327156066895, "learning_rate": 4.927051328072332e-07, "loss": 3.9797, "step": 2798 }, { "epoch": 2.2126482213438736, "grad_norm": 2.0856103897094727, "learning_rate": 4.917760802452754e-07, "loss": 3.9996, "step": 2799 }, { "epoch": 2.2134387351778657, "grad_norm": 2.3482797145843506, "learning_rate": 4.908477326593079e-07, "loss": 3.4238, "step": 2800 }, { "epoch": 2.214229249011858, "grad_norm": 1.9326077699661255, "learning_rate": 4.899200906984557e-07, "loss": 4.2635, "step": 2801 }, { "epoch": 2.21501976284585, "grad_norm": 2.2191696166992188, "learning_rate": 4.889931550113499e-07, "loss": 3.857, "step": 2802 }, { "epoch": 2.215810276679842, "grad_norm": 2.1303060054779053, "learning_rate": 4.880669262461278e-07, "loss": 3.9595, "step": 2803 }, { "epoch": 2.216600790513834, "grad_norm": 1.9961309432983398, "learning_rate": 4.871414050504319e-07, "loss": 4.2089, "step": 2804 }, { "epoch": 2.217391304347826, "grad_norm": 2.207444906234741, "learning_rate": 4.862165920714107e-07, "loss": 3.9471, "step": 2805 }, { "epoch": 2.2181818181818183, "grad_norm": 2.3586783409118652, "learning_rate": 4.852924879557169e-07, "loss": 3.6041, "step": 2806 }, { "epoch": 2.2189723320158103, "grad_norm": 2.1241374015808105, "learning_rate": 4.843690933495079e-07, "loss": 3.8051, "step": 2807 }, { "epoch": 2.2197628458498024, "grad_norm": 2.1025707721710205, "learning_rate": 4.834464088984453e-07, "loss": 4.0925, "step": 2808 }, { "epoch": 2.2205533596837945, "grad_norm": 2.1665055751800537, "learning_rate": 4.825244352476923e-07, "loss": 4.0475, "step": 2809 }, { "epoch": 2.2213438735177866, "grad_norm": 2.3116884231567383, "learning_rate": 4.816031730419177e-07, "loss": 3.9739, "step": 2810 }, { "epoch": 2.2221343873517787, "grad_norm": 2.7119436264038086, "learning_rate": 4.806826229252919e-07, "loss": 4.0432, "step": 2811 }, { "epoch": 2.222924901185771, "grad_norm": 2.5266010761260986, "learning_rate": 4.797627855414862e-07, "loss": 3.702, "step": 2812 }, { "epoch": 2.223715415019763, "grad_norm": 2.108759641647339, "learning_rate": 4.788436615336745e-07, "loss": 3.997, "step": 2813 }, { "epoch": 2.224505928853755, "grad_norm": 4.0953779220581055, "learning_rate": 4.779252515445325e-07, "loss": 3.8699, "step": 2814 }, { "epoch": 2.225296442687747, "grad_norm": 2.102187395095825, "learning_rate": 4.770075562162355e-07, "loss": 4.0148, "step": 2815 }, { "epoch": 2.226086956521739, "grad_norm": 2.3643507957458496, "learning_rate": 4.760905761904598e-07, "loss": 3.4498, "step": 2816 }, { "epoch": 2.2268774703557312, "grad_norm": 2.1839725971221924, "learning_rate": 4.751743121083813e-07, "loss": 3.8668, "step": 2817 }, { "epoch": 2.2276679841897233, "grad_norm": 2.0885751247406006, "learning_rate": 4.7425876461067516e-07, "loss": 4.0606, "step": 2818 }, { "epoch": 2.2284584980237154, "grad_norm": 2.2547173500061035, "learning_rate": 4.733439343375159e-07, "loss": 4.1556, "step": 2819 }, { "epoch": 2.2292490118577075, "grad_norm": 2.1897597312927246, "learning_rate": 4.7242982192857653e-07, "loss": 3.8887, "step": 2820 }, { "epoch": 2.2300395256916996, "grad_norm": 2.1496667861938477, "learning_rate": 4.7151642802302695e-07, "loss": 3.8214, "step": 2821 }, { "epoch": 2.2308300395256917, "grad_norm": 2.1956729888916016, "learning_rate": 4.706037532595363e-07, "loss": 4.1544, "step": 2822 }, { "epoch": 2.231620553359684, "grad_norm": 2.0981390476226807, "learning_rate": 4.696917982762697e-07, "loss": 4.0834, "step": 2823 }, { "epoch": 2.232411067193676, "grad_norm": 2.046031951904297, "learning_rate": 4.6878056371088984e-07, "loss": 4.3017, "step": 2824 }, { "epoch": 2.233201581027668, "grad_norm": 2.041597366333008, "learning_rate": 4.678700502005551e-07, "loss": 4.0341, "step": 2825 }, { "epoch": 2.23399209486166, "grad_norm": 2.036477565765381, "learning_rate": 4.6696025838192e-07, "loss": 4.26, "step": 2826 }, { "epoch": 2.234782608695652, "grad_norm": 2.5720856189727783, "learning_rate": 4.66051188891134e-07, "loss": 3.1488, "step": 2827 }, { "epoch": 2.2355731225296442, "grad_norm": 2.355032444000244, "learning_rate": 4.6514284236384245e-07, "loss": 3.7171, "step": 2828 }, { "epoch": 2.2363636363636363, "grad_norm": 2.2888827323913574, "learning_rate": 4.6423521943518345e-07, "loss": 3.9302, "step": 2829 }, { "epoch": 2.2371541501976284, "grad_norm": 2.2139780521392822, "learning_rate": 4.6332832073979027e-07, "loss": 3.9452, "step": 2830 }, { "epoch": 2.2379446640316205, "grad_norm": 2.0250654220581055, "learning_rate": 4.624221469117912e-07, "loss": 4.1014, "step": 2831 }, { "epoch": 2.2387351778656126, "grad_norm": 2.7324254512786865, "learning_rate": 4.6151669858480473e-07, "loss": 3.8893, "step": 2832 }, { "epoch": 2.2395256916996047, "grad_norm": 1.8589943647384644, "learning_rate": 4.6061197639194376e-07, "loss": 4.3707, "step": 2833 }, { "epoch": 2.240316205533597, "grad_norm": 1.9853399991989136, "learning_rate": 4.5970798096581355e-07, "loss": 4.3274, "step": 2834 }, { "epoch": 2.241106719367589, "grad_norm": 2.094752073287964, "learning_rate": 4.588047129385107e-07, "loss": 4.0653, "step": 2835 }, { "epoch": 2.241897233201581, "grad_norm": 2.1862261295318604, "learning_rate": 4.5790217294162353e-07, "loss": 4.0527, "step": 2836 }, { "epoch": 2.242687747035573, "grad_norm": 2.0378034114837646, "learning_rate": 4.570003616062313e-07, "loss": 4.0831, "step": 2837 }, { "epoch": 2.243478260869565, "grad_norm": 2.4056344032287598, "learning_rate": 4.560992795629024e-07, "loss": 3.6132, "step": 2838 }, { "epoch": 2.2442687747035572, "grad_norm": 2.0315210819244385, "learning_rate": 4.5519892744169785e-07, "loss": 4.1351, "step": 2839 }, { "epoch": 2.2450592885375493, "grad_norm": 2.2692065238952637, "learning_rate": 4.542993058721669e-07, "loss": 3.6617, "step": 2840 }, { "epoch": 2.2458498023715414, "grad_norm": 2.210923671722412, "learning_rate": 4.5340041548334696e-07, "loss": 4.0045, "step": 2841 }, { "epoch": 2.2466403162055335, "grad_norm": 2.0106325149536133, "learning_rate": 4.5250225690376535e-07, "loss": 4.0394, "step": 2842 }, { "epoch": 2.2474308300395256, "grad_norm": 2.0728955268859863, "learning_rate": 4.5160483076143894e-07, "loss": 4.102, "step": 2843 }, { "epoch": 2.2482213438735177, "grad_norm": 2.1525638103485107, "learning_rate": 4.507081376838696e-07, "loss": 4.17, "step": 2844 }, { "epoch": 2.2490118577075098, "grad_norm": 2.192519187927246, "learning_rate": 4.498121782980486e-07, "loss": 3.9752, "step": 2845 }, { "epoch": 2.249802371541502, "grad_norm": 2.257361650466919, "learning_rate": 4.4891695323045367e-07, "loss": 3.7212, "step": 2846 }, { "epoch": 2.250592885375494, "grad_norm": 2.015843152999878, "learning_rate": 4.4802246310704924e-07, "loss": 3.8982, "step": 2847 }, { "epoch": 2.251383399209486, "grad_norm": 2.0128448009490967, "learning_rate": 4.471287085532855e-07, "loss": 4.0167, "step": 2848 }, { "epoch": 2.252173913043478, "grad_norm": 2.0829849243164062, "learning_rate": 4.462356901940993e-07, "loss": 4.115, "step": 2849 }, { "epoch": 2.2529644268774702, "grad_norm": 2.0705034732818604, "learning_rate": 4.453434086539103e-07, "loss": 3.9903, "step": 2850 }, { "epoch": 2.2537549407114623, "grad_norm": 2.0430831909179688, "learning_rate": 4.4445186455662646e-07, "loss": 4.328, "step": 2851 }, { "epoch": 2.2545454545454544, "grad_norm": 2.049219846725464, "learning_rate": 4.43561058525637e-07, "loss": 4.0981, "step": 2852 }, { "epoch": 2.2553359683794465, "grad_norm": 2.1836659908294678, "learning_rate": 4.426709911838167e-07, "loss": 3.9768, "step": 2853 }, { "epoch": 2.2561264822134386, "grad_norm": 2.0280909538269043, "learning_rate": 4.417816631535236e-07, "loss": 4.2743, "step": 2854 }, { "epoch": 2.2569169960474307, "grad_norm": 2.039971113204956, "learning_rate": 4.408930750565984e-07, "loss": 4.0838, "step": 2855 }, { "epoch": 2.2577075098814228, "grad_norm": 2.296727180480957, "learning_rate": 4.400052275143647e-07, "loss": 3.8539, "step": 2856 }, { "epoch": 2.258498023715415, "grad_norm": 2.1944029331207275, "learning_rate": 4.3911812114762873e-07, "loss": 4.0496, "step": 2857 }, { "epoch": 2.259288537549407, "grad_norm": 2.201347827911377, "learning_rate": 4.382317565766767e-07, "loss": 3.5598, "step": 2858 }, { "epoch": 2.260079051383399, "grad_norm": 2.0293586254119873, "learning_rate": 4.3734613442127886e-07, "loss": 4.1412, "step": 2859 }, { "epoch": 2.260869565217391, "grad_norm": 2.1220948696136475, "learning_rate": 4.364612553006847e-07, "loss": 4.0409, "step": 2860 }, { "epoch": 2.261660079051383, "grad_norm": 2.4069600105285645, "learning_rate": 4.355771198336235e-07, "loss": 3.9154, "step": 2861 }, { "epoch": 2.2624505928853753, "grad_norm": 2.3707447052001953, "learning_rate": 4.34693728638306e-07, "loss": 4.125, "step": 2862 }, { "epoch": 2.2632411067193674, "grad_norm": 1.9692994356155396, "learning_rate": 4.338110823324218e-07, "loss": 4.3131, "step": 2863 }, { "epoch": 2.2640316205533595, "grad_norm": 1.9540678262710571, "learning_rate": 4.329291815331399e-07, "loss": 4.2929, "step": 2864 }, { "epoch": 2.2648221343873516, "grad_norm": 2.1740598678588867, "learning_rate": 4.32048026857108e-07, "loss": 4.0053, "step": 2865 }, { "epoch": 2.265612648221344, "grad_norm": 2.0494470596313477, "learning_rate": 4.3116761892045176e-07, "loss": 4.1962, "step": 2866 }, { "epoch": 2.2664031620553358, "grad_norm": 2.2354979515075684, "learning_rate": 4.3028795833877525e-07, "loss": 3.8747, "step": 2867 }, { "epoch": 2.2671936758893283, "grad_norm": 2.255429267883301, "learning_rate": 4.294090457271595e-07, "loss": 3.8794, "step": 2868 }, { "epoch": 2.26798418972332, "grad_norm": 1.970616340637207, "learning_rate": 4.2853088170016337e-07, "loss": 4.3703, "step": 2869 }, { "epoch": 2.2687747035573125, "grad_norm": 2.0622718334198, "learning_rate": 4.276534668718205e-07, "loss": 4.1169, "step": 2870 }, { "epoch": 2.269565217391304, "grad_norm": 2.372467279434204, "learning_rate": 4.26776801855642e-07, "loss": 3.4773, "step": 2871 }, { "epoch": 2.2703557312252967, "grad_norm": 2.051579713821411, "learning_rate": 4.259008872646157e-07, "loss": 3.8794, "step": 2872 }, { "epoch": 2.2711462450592883, "grad_norm": 2.235138416290283, "learning_rate": 4.250257237112023e-07, "loss": 4.1253, "step": 2873 }, { "epoch": 2.271936758893281, "grad_norm": 2.1186892986297607, "learning_rate": 4.24151311807339e-07, "loss": 4.0846, "step": 2874 }, { "epoch": 2.2727272727272725, "grad_norm": 2.0803868770599365, "learning_rate": 4.232776521644369e-07, "loss": 4.1496, "step": 2875 }, { "epoch": 2.273517786561265, "grad_norm": 2.076878309249878, "learning_rate": 4.2240474539338135e-07, "loss": 4.1618, "step": 2876 }, { "epoch": 2.274308300395257, "grad_norm": 2.255868434906006, "learning_rate": 4.2153259210453093e-07, "loss": 3.7478, "step": 2877 }, { "epoch": 2.275098814229249, "grad_norm": 2.5905354022979736, "learning_rate": 4.206611929077178e-07, "loss": 3.9734, "step": 2878 }, { "epoch": 2.2758893280632413, "grad_norm": 2.276042938232422, "learning_rate": 4.1979054841224644e-07, "loss": 3.8897, "step": 2879 }, { "epoch": 2.2766798418972334, "grad_norm": 2.144022226333618, "learning_rate": 4.189206592268942e-07, "loss": 4.0383, "step": 2880 }, { "epoch": 2.2774703557312255, "grad_norm": 1.9914108514785767, "learning_rate": 4.180515259599092e-07, "loss": 4.1806, "step": 2881 }, { "epoch": 2.2782608695652176, "grad_norm": 2.3276526927948, "learning_rate": 4.17183149219012e-07, "loss": 3.6481, "step": 2882 }, { "epoch": 2.2790513833992097, "grad_norm": 2.170015335083008, "learning_rate": 4.163155296113936e-07, "loss": 3.9956, "step": 2883 }, { "epoch": 2.2798418972332017, "grad_norm": 2.235703468322754, "learning_rate": 4.1544866774371643e-07, "loss": 3.9044, "step": 2884 }, { "epoch": 2.280632411067194, "grad_norm": 2.069786787033081, "learning_rate": 4.14582564222112e-07, "loss": 3.9203, "step": 2885 }, { "epoch": 2.281422924901186, "grad_norm": 2.1164820194244385, "learning_rate": 4.1371721965218223e-07, "loss": 3.9955, "step": 2886 }, { "epoch": 2.282213438735178, "grad_norm": 2.223440408706665, "learning_rate": 4.128526346389983e-07, "loss": 3.7437, "step": 2887 }, { "epoch": 2.28300395256917, "grad_norm": 3.8898892402648926, "learning_rate": 4.119888097871001e-07, "loss": 3.8113, "step": 2888 }, { "epoch": 2.283794466403162, "grad_norm": 1.9932819604873657, "learning_rate": 4.111257457004967e-07, "loss": 4.1891, "step": 2889 }, { "epoch": 2.2845849802371543, "grad_norm": 2.2857184410095215, "learning_rate": 4.1026344298266364e-07, "loss": 3.9727, "step": 2890 }, { "epoch": 2.2853754940711464, "grad_norm": 2.205775737762451, "learning_rate": 4.094019022365448e-07, "loss": 3.7644, "step": 2891 }, { "epoch": 2.2861660079051385, "grad_norm": 2.445854425430298, "learning_rate": 4.0854112406455337e-07, "loss": 3.4518, "step": 2892 }, { "epoch": 2.2869565217391306, "grad_norm": 2.116027593612671, "learning_rate": 4.0768110906856574e-07, "loss": 3.9771, "step": 2893 }, { "epoch": 2.2877470355731226, "grad_norm": 2.1361186504364014, "learning_rate": 4.0682185784992705e-07, "loss": 4.1396, "step": 2894 }, { "epoch": 2.2885375494071147, "grad_norm": 1.8076856136322021, "learning_rate": 4.0596337100944793e-07, "loss": 4.3055, "step": 2895 }, { "epoch": 2.289328063241107, "grad_norm": 2.016937732696533, "learning_rate": 4.051056491474039e-07, "loss": 4.1926, "step": 2896 }, { "epoch": 2.290118577075099, "grad_norm": 2.023179769515991, "learning_rate": 4.0424869286353654e-07, "loss": 4.1018, "step": 2897 }, { "epoch": 2.290909090909091, "grad_norm": 2.0541515350341797, "learning_rate": 4.033925027570518e-07, "loss": 3.8427, "step": 2898 }, { "epoch": 2.291699604743083, "grad_norm": 2.055370807647705, "learning_rate": 4.0253707942661846e-07, "loss": 4.0884, "step": 2899 }, { "epoch": 2.292490118577075, "grad_norm": 2.1308038234710693, "learning_rate": 4.0168242347037183e-07, "loss": 4.0608, "step": 2900 }, { "epoch": 2.2932806324110673, "grad_norm": 2.1740894317626953, "learning_rate": 4.008285354859092e-07, "loss": 3.8922, "step": 2901 }, { "epoch": 2.2940711462450594, "grad_norm": 1.992264747619629, "learning_rate": 3.9997541607029014e-07, "loss": 4.0153, "step": 2902 }, { "epoch": 2.2948616600790515, "grad_norm": 2.401360273361206, "learning_rate": 3.991230658200373e-07, "loss": 3.6878, "step": 2903 }, { "epoch": 2.2956521739130435, "grad_norm": 2.0959386825561523, "learning_rate": 3.9827148533113733e-07, "loss": 3.8393, "step": 2904 }, { "epoch": 2.2964426877470356, "grad_norm": 2.2453131675720215, "learning_rate": 3.9742067519903584e-07, "loss": 3.73, "step": 2905 }, { "epoch": 2.2972332015810277, "grad_norm": 2.092893600463867, "learning_rate": 3.9657063601864154e-07, "loss": 4.1129, "step": 2906 }, { "epoch": 2.29802371541502, "grad_norm": 2.240833282470703, "learning_rate": 3.9572136838432356e-07, "loss": 3.9244, "step": 2907 }, { "epoch": 2.298814229249012, "grad_norm": 2.1221518516540527, "learning_rate": 3.948728728899114e-07, "loss": 4.0498, "step": 2908 }, { "epoch": 2.299604743083004, "grad_norm": 2.1471176147460938, "learning_rate": 3.9402515012869503e-07, "loss": 4.0365, "step": 2909 }, { "epoch": 2.300395256916996, "grad_norm": 1.9516586065292358, "learning_rate": 3.9317820069342433e-07, "loss": 4.1052, "step": 2910 }, { "epoch": 2.301185770750988, "grad_norm": 2.266040802001953, "learning_rate": 3.923320251763068e-07, "loss": 3.8973, "step": 2911 }, { "epoch": 2.3019762845849803, "grad_norm": 2.311701536178589, "learning_rate": 3.9148662416901156e-07, "loss": 3.7728, "step": 2912 }, { "epoch": 2.3027667984189724, "grad_norm": 2.477179527282715, "learning_rate": 3.9064199826266354e-07, "loss": 4.0096, "step": 2913 }, { "epoch": 2.3035573122529645, "grad_norm": 1.9477367401123047, "learning_rate": 3.8979814804784726e-07, "loss": 4.2837, "step": 2914 }, { "epoch": 2.3043478260869565, "grad_norm": 2.2885448932647705, "learning_rate": 3.889550741146041e-07, "loss": 3.5183, "step": 2915 }, { "epoch": 2.3051383399209486, "grad_norm": 2.2236034870147705, "learning_rate": 3.8811277705243347e-07, "loss": 3.7846, "step": 2916 }, { "epoch": 2.3059288537549407, "grad_norm": 2.427159547805786, "learning_rate": 3.872712574502905e-07, "loss": 3.677, "step": 2917 }, { "epoch": 2.306719367588933, "grad_norm": 2.110365390777588, "learning_rate": 3.8643051589658825e-07, "loss": 3.946, "step": 2918 }, { "epoch": 2.307509881422925, "grad_norm": 2.1771254539489746, "learning_rate": 3.855905529791929e-07, "loss": 4.073, "step": 2919 }, { "epoch": 2.308300395256917, "grad_norm": 2.080944776535034, "learning_rate": 3.8475136928542966e-07, "loss": 4.4277, "step": 2920 }, { "epoch": 2.309090909090909, "grad_norm": 2.0010173320770264, "learning_rate": 3.839129654020772e-07, "loss": 4.401, "step": 2921 }, { "epoch": 2.309881422924901, "grad_norm": 2.383734941482544, "learning_rate": 3.830753419153681e-07, "loss": 3.9465, "step": 2922 }, { "epoch": 2.3106719367588933, "grad_norm": 1.9999767541885376, "learning_rate": 3.8223849941099057e-07, "loss": 4.0939, "step": 2923 }, { "epoch": 2.3114624505928854, "grad_norm": 2.001502752304077, "learning_rate": 3.814024384740865e-07, "loss": 4.0563, "step": 2924 }, { "epoch": 2.3122529644268774, "grad_norm": 1.9762526750564575, "learning_rate": 3.805671596892509e-07, "loss": 4.317, "step": 2925 }, { "epoch": 2.3130434782608695, "grad_norm": 2.1148085594177246, "learning_rate": 3.797326636405322e-07, "loss": 4.1966, "step": 2926 }, { "epoch": 2.3138339920948616, "grad_norm": 2.1871073246002197, "learning_rate": 3.788989509114312e-07, "loss": 4.1489, "step": 2927 }, { "epoch": 2.3146245059288537, "grad_norm": 2.170100450515747, "learning_rate": 3.780660220849017e-07, "loss": 4.0451, "step": 2928 }, { "epoch": 2.315415019762846, "grad_norm": 2.1618804931640625, "learning_rate": 3.772338777433482e-07, "loss": 3.7617, "step": 2929 }, { "epoch": 2.316205533596838, "grad_norm": 2.378645896911621, "learning_rate": 3.764025184686283e-07, "loss": 4.038, "step": 2930 }, { "epoch": 2.31699604743083, "grad_norm": 2.1028895378112793, "learning_rate": 3.755719448420488e-07, "loss": 4.1407, "step": 2931 }, { "epoch": 2.317786561264822, "grad_norm": 2.149047613143921, "learning_rate": 3.7474215744436764e-07, "loss": 4.0351, "step": 2932 }, { "epoch": 2.318577075098814, "grad_norm": 1.985180139541626, "learning_rate": 3.7391315685579514e-07, "loss": 4.2184, "step": 2933 }, { "epoch": 2.3193675889328063, "grad_norm": 2.064873695373535, "learning_rate": 3.730849436559883e-07, "loss": 3.9753, "step": 2934 }, { "epoch": 2.3201581027667983, "grad_norm": 2.26452898979187, "learning_rate": 3.722575184240555e-07, "loss": 3.9874, "step": 2935 }, { "epoch": 2.3209486166007904, "grad_norm": 2.053659200668335, "learning_rate": 3.7143088173855344e-07, "loss": 4.1151, "step": 2936 }, { "epoch": 2.3217391304347825, "grad_norm": 2.0565409660339355, "learning_rate": 3.70605034177488e-07, "loss": 4.173, "step": 2937 }, { "epoch": 2.3225296442687746, "grad_norm": 2.133837938308716, "learning_rate": 3.697799763183127e-07, "loss": 4.3095, "step": 2938 }, { "epoch": 2.3233201581027667, "grad_norm": 2.14797306060791, "learning_rate": 3.6895570873792953e-07, "loss": 4.0225, "step": 2939 }, { "epoch": 2.324110671936759, "grad_norm": 2.1039600372314453, "learning_rate": 3.681322320126865e-07, "loss": 3.7894, "step": 2940 }, { "epoch": 2.324901185770751, "grad_norm": 2.2229409217834473, "learning_rate": 3.67309546718381e-07, "loss": 3.7099, "step": 2941 }, { "epoch": 2.325691699604743, "grad_norm": 2.494149923324585, "learning_rate": 3.664876534302545e-07, "loss": 3.6109, "step": 2942 }, { "epoch": 2.326482213438735, "grad_norm": 2.2613799571990967, "learning_rate": 3.6566655272299646e-07, "loss": 4.007, "step": 2943 }, { "epoch": 2.327272727272727, "grad_norm": 2.0924978256225586, "learning_rate": 3.648462451707413e-07, "loss": 4.1851, "step": 2944 }, { "epoch": 2.3280632411067192, "grad_norm": 2.189425230026245, "learning_rate": 3.640267313470691e-07, "loss": 4.1091, "step": 2945 }, { "epoch": 2.3288537549407113, "grad_norm": 2.0420405864715576, "learning_rate": 3.632080118250051e-07, "loss": 4.067, "step": 2946 }, { "epoch": 2.3296442687747034, "grad_norm": 2.1415469646453857, "learning_rate": 3.623900871770186e-07, "loss": 4.0166, "step": 2947 }, { "epoch": 2.3304347826086955, "grad_norm": 2.1348538398742676, "learning_rate": 3.6157295797502387e-07, "loss": 3.7823, "step": 2948 }, { "epoch": 2.3312252964426876, "grad_norm": 2.0963938236236572, "learning_rate": 3.6075662479037835e-07, "loss": 3.7315, "step": 2949 }, { "epoch": 2.3320158102766797, "grad_norm": 2.0032851696014404, "learning_rate": 3.599410881938834e-07, "loss": 4.0529, "step": 2950 }, { "epoch": 2.332806324110672, "grad_norm": 2.1452407836914062, "learning_rate": 3.5912634875578267e-07, "loss": 4.0727, "step": 2951 }, { "epoch": 2.333596837944664, "grad_norm": 2.270573139190674, "learning_rate": 3.5831240704576263e-07, "loss": 3.8966, "step": 2952 }, { "epoch": 2.334387351778656, "grad_norm": 2.115280866622925, "learning_rate": 3.5749926363295325e-07, "loss": 3.9803, "step": 2953 }, { "epoch": 2.335177865612648, "grad_norm": 2.272212266921997, "learning_rate": 3.566869190859243e-07, "loss": 3.6275, "step": 2954 }, { "epoch": 2.33596837944664, "grad_norm": 2.2331008911132812, "learning_rate": 3.5587537397268793e-07, "loss": 4.007, "step": 2955 }, { "epoch": 2.3367588932806322, "grad_norm": 2.2201616764068604, "learning_rate": 3.5506462886069766e-07, "loss": 3.8519, "step": 2956 }, { "epoch": 2.3375494071146243, "grad_norm": 2.241945266723633, "learning_rate": 3.5425468431684706e-07, "loss": 3.9263, "step": 2957 }, { "epoch": 2.3383399209486164, "grad_norm": 2.015005111694336, "learning_rate": 3.534455409074702e-07, "loss": 3.8032, "step": 2958 }, { "epoch": 2.3391304347826085, "grad_norm": 2.2675118446350098, "learning_rate": 3.52637199198341e-07, "loss": 4.0501, "step": 2959 }, { "epoch": 2.3399209486166006, "grad_norm": 2.0161828994750977, "learning_rate": 3.5182965975467187e-07, "loss": 4.2517, "step": 2960 }, { "epoch": 2.3407114624505927, "grad_norm": 2.1055612564086914, "learning_rate": 3.5102292314111595e-07, "loss": 4.0468, "step": 2961 }, { "epoch": 2.341501976284585, "grad_norm": 2.880842685699463, "learning_rate": 3.502169899217644e-07, "loss": 3.7572, "step": 2962 }, { "epoch": 2.3422924901185773, "grad_norm": 2.224605083465576, "learning_rate": 3.4941186066014537e-07, "loss": 3.536, "step": 2963 }, { "epoch": 2.343083003952569, "grad_norm": 2.245774507522583, "learning_rate": 3.4860753591922604e-07, "loss": 3.8004, "step": 2964 }, { "epoch": 2.3438735177865615, "grad_norm": 2.782724618911743, "learning_rate": 3.4780401626141193e-07, "loss": 4.0214, "step": 2965 }, { "epoch": 2.344664031620553, "grad_norm": 2.3959059715270996, "learning_rate": 3.4700130224854343e-07, "loss": 3.7057, "step": 2966 }, { "epoch": 2.3454545454545457, "grad_norm": 2.213465452194214, "learning_rate": 3.4619939444189904e-07, "loss": 4.0238, "step": 2967 }, { "epoch": 2.3462450592885373, "grad_norm": 1.9595872163772583, "learning_rate": 3.4539829340219354e-07, "loss": 4.3551, "step": 2968 }, { "epoch": 2.34703557312253, "grad_norm": 1.897900104522705, "learning_rate": 3.445979996895769e-07, "loss": 4.3927, "step": 2969 }, { "epoch": 2.3478260869565215, "grad_norm": 2.1092593669891357, "learning_rate": 3.4379851386363564e-07, "loss": 3.9559, "step": 2970 }, { "epoch": 2.348616600790514, "grad_norm": 2.011162757873535, "learning_rate": 3.429998364833899e-07, "loss": 4.235, "step": 2971 }, { "epoch": 2.3494071146245057, "grad_norm": 2.3276305198669434, "learning_rate": 3.422019681072953e-07, "loss": 3.7452, "step": 2972 }, { "epoch": 2.3501976284584982, "grad_norm": 2.2966203689575195, "learning_rate": 3.4140490929324296e-07, "loss": 4.2523, "step": 2973 }, { "epoch": 2.3509881422924903, "grad_norm": 2.0543150901794434, "learning_rate": 3.406086605985559e-07, "loss": 3.9759, "step": 2974 }, { "epoch": 2.3517786561264824, "grad_norm": 2.03900408744812, "learning_rate": 3.3981322257999163e-07, "loss": 4.4115, "step": 2975 }, { "epoch": 2.3525691699604745, "grad_norm": 2.0474133491516113, "learning_rate": 3.390185957937412e-07, "loss": 4.0614, "step": 2976 }, { "epoch": 2.3533596837944666, "grad_norm": 2.196131467819214, "learning_rate": 3.382247807954277e-07, "loss": 3.3531, "step": 2977 }, { "epoch": 2.3541501976284587, "grad_norm": 2.3787825107574463, "learning_rate": 3.37431778140107e-07, "loss": 3.9254, "step": 2978 }, { "epoch": 2.3549407114624508, "grad_norm": 2.346285820007324, "learning_rate": 3.366395883822674e-07, "loss": 3.7092, "step": 2979 }, { "epoch": 2.355731225296443, "grad_norm": 2.0495083332061768, "learning_rate": 3.358482120758269e-07, "loss": 4.0277, "step": 2980 }, { "epoch": 2.356521739130435, "grad_norm": 2.058990955352783, "learning_rate": 3.350576497741374e-07, "loss": 4.1089, "step": 2981 }, { "epoch": 2.357312252964427, "grad_norm": 2.420837640762329, "learning_rate": 3.342679020299801e-07, "loss": 3.9909, "step": 2982 }, { "epoch": 2.358102766798419, "grad_norm": 2.2312517166137695, "learning_rate": 3.334789693955663e-07, "loss": 3.7763, "step": 2983 }, { "epoch": 2.358893280632411, "grad_norm": 2.103039503097534, "learning_rate": 3.3269085242253817e-07, "loss": 4.3001, "step": 2984 }, { "epoch": 2.3596837944664033, "grad_norm": 2.112126588821411, "learning_rate": 3.3190355166196744e-07, "loss": 3.8412, "step": 2985 }, { "epoch": 2.3604743083003954, "grad_norm": 2.2801010608673096, "learning_rate": 3.311170676643547e-07, "loss": 4.1319, "step": 2986 }, { "epoch": 2.3612648221343875, "grad_norm": 2.0498340129852295, "learning_rate": 3.3033140097962994e-07, "loss": 4.1087, "step": 2987 }, { "epoch": 2.3620553359683796, "grad_norm": 2.1894519329071045, "learning_rate": 3.295465521571515e-07, "loss": 3.9055, "step": 2988 }, { "epoch": 2.3628458498023717, "grad_norm": 2.1521778106689453, "learning_rate": 3.2876252174570544e-07, "loss": 4.1449, "step": 2989 }, { "epoch": 2.3636363636363638, "grad_norm": 2.3060050010681152, "learning_rate": 3.279793102935061e-07, "loss": 3.786, "step": 2990 }, { "epoch": 2.364426877470356, "grad_norm": 2.106206178665161, "learning_rate": 3.2719691834819556e-07, "loss": 4.1179, "step": 2991 }, { "epoch": 2.365217391304348, "grad_norm": 2.3109827041625977, "learning_rate": 3.2641534645684117e-07, "loss": 3.9127, "step": 2992 }, { "epoch": 2.36600790513834, "grad_norm": 2.0890181064605713, "learning_rate": 3.2563459516593823e-07, "loss": 4.0199, "step": 2993 }, { "epoch": 2.366798418972332, "grad_norm": 2.042954444885254, "learning_rate": 3.248546650214094e-07, "loss": 4.0505, "step": 2994 }, { "epoch": 2.367588932806324, "grad_norm": 2.223935842514038, "learning_rate": 3.2407555656860014e-07, "loss": 3.725, "step": 2995 }, { "epoch": 2.3683794466403163, "grad_norm": 2.103994131088257, "learning_rate": 3.2329727035228385e-07, "loss": 4.0731, "step": 2996 }, { "epoch": 2.3691699604743084, "grad_norm": 2.3290605545043945, "learning_rate": 3.225198069166581e-07, "loss": 3.3858, "step": 2997 }, { "epoch": 2.3699604743083005, "grad_norm": 2.0892598628997803, "learning_rate": 3.2174316680534513e-07, "loss": 4.1255, "step": 2998 }, { "epoch": 2.3707509881422926, "grad_norm": 2.062631607055664, "learning_rate": 3.209673505613916e-07, "loss": 4.0918, "step": 2999 }, { "epoch": 2.3715415019762847, "grad_norm": 2.1472373008728027, "learning_rate": 3.2019235872726867e-07, "loss": 4.0413, "step": 3000 }, { "epoch": 2.3715415019762847, "eval_loss": 3.937901735305786, "eval_runtime": 3.8623, "eval_samples_per_second": 776.739, "eval_steps_per_second": 3.366, "step": 3000 }, { "epoch": 2.3723320158102768, "grad_norm": 2.2331326007843018, "learning_rate": 3.1941819184486914e-07, "loss": 3.9376, "step": 3001 }, { "epoch": 2.373122529644269, "grad_norm": 2.184760808944702, "learning_rate": 3.1864485045551205e-07, "loss": 4.1612, "step": 3002 }, { "epoch": 2.373913043478261, "grad_norm": 2.209885358810425, "learning_rate": 3.178723350999362e-07, "loss": 4.1464, "step": 3003 }, { "epoch": 2.374703557312253, "grad_norm": 2.1984665393829346, "learning_rate": 3.171006463183044e-07, "loss": 4.0399, "step": 3004 }, { "epoch": 2.375494071146245, "grad_norm": 2.1225926876068115, "learning_rate": 3.163297846502013e-07, "loss": 3.984, "step": 3005 }, { "epoch": 2.376284584980237, "grad_norm": 2.1693239212036133, "learning_rate": 3.1555975063463345e-07, "loss": 3.8512, "step": 3006 }, { "epoch": 2.3770750988142293, "grad_norm": 2.0618746280670166, "learning_rate": 3.147905448100278e-07, "loss": 3.9106, "step": 3007 }, { "epoch": 2.3778656126482214, "grad_norm": 2.278391122817993, "learning_rate": 3.1402216771423367e-07, "loss": 3.8598, "step": 3008 }, { "epoch": 2.3786561264822135, "grad_norm": 2.143280506134033, "learning_rate": 3.132546198845186e-07, "loss": 4.1399, "step": 3009 }, { "epoch": 2.3794466403162056, "grad_norm": 2.217982530593872, "learning_rate": 3.124879018575727e-07, "loss": 3.9533, "step": 3010 }, { "epoch": 2.3802371541501977, "grad_norm": 2.156477689743042, "learning_rate": 3.1172201416950515e-07, "loss": 3.7857, "step": 3011 }, { "epoch": 2.3810276679841897, "grad_norm": 2.0283751487731934, "learning_rate": 3.1095695735584336e-07, "loss": 4.0188, "step": 3012 }, { "epoch": 2.381818181818182, "grad_norm": 2.34765887260437, "learning_rate": 3.1019273195153443e-07, "loss": 3.7802, "step": 3013 }, { "epoch": 2.382608695652174, "grad_norm": 2.131340980529785, "learning_rate": 3.0942933849094593e-07, "loss": 3.8413, "step": 3014 }, { "epoch": 2.383399209486166, "grad_norm": 2.1491878032684326, "learning_rate": 3.0866677750786066e-07, "loss": 3.9509, "step": 3015 }, { "epoch": 2.384189723320158, "grad_norm": 2.3785905838012695, "learning_rate": 3.0790504953548115e-07, "loss": 3.5871, "step": 3016 }, { "epoch": 2.38498023715415, "grad_norm": 2.059063196182251, "learning_rate": 3.0714415510642733e-07, "loss": 4.043, "step": 3017 }, { "epoch": 2.3857707509881423, "grad_norm": 2.257765293121338, "learning_rate": 3.0638409475273584e-07, "loss": 4.0851, "step": 3018 }, { "epoch": 2.3865612648221344, "grad_norm": 2.33659291267395, "learning_rate": 3.056248690058606e-07, "loss": 3.7775, "step": 3019 }, { "epoch": 2.3873517786561265, "grad_norm": 2.4271464347839355, "learning_rate": 3.048664783966719e-07, "loss": 3.6814, "step": 3020 }, { "epoch": 2.3881422924901186, "grad_norm": 1.9375990629196167, "learning_rate": 3.041089234554547e-07, "loss": 4.2213, "step": 3021 }, { "epoch": 2.3889328063241106, "grad_norm": 2.570812463760376, "learning_rate": 3.033522047119121e-07, "loss": 3.8155, "step": 3022 }, { "epoch": 2.3897233201581027, "grad_norm": 2.008787155151367, "learning_rate": 3.025963226951612e-07, "loss": 4.1795, "step": 3023 }, { "epoch": 2.390513833992095, "grad_norm": 2.14224910736084, "learning_rate": 3.018412779337333e-07, "loss": 3.9053, "step": 3024 }, { "epoch": 2.391304347826087, "grad_norm": 2.2142157554626465, "learning_rate": 3.010870709555754e-07, "loss": 3.8069, "step": 3025 }, { "epoch": 2.392094861660079, "grad_norm": 2.173997163772583, "learning_rate": 3.0033370228804824e-07, "loss": 3.901, "step": 3026 }, { "epoch": 2.392885375494071, "grad_norm": 2.514061212539673, "learning_rate": 2.9958117245792677e-07, "loss": 3.9441, "step": 3027 }, { "epoch": 2.393675889328063, "grad_norm": 2.0673880577087402, "learning_rate": 2.9882948199139876e-07, "loss": 4.0271, "step": 3028 }, { "epoch": 2.3944664031620553, "grad_norm": 2.233372926712036, "learning_rate": 2.9807863141406575e-07, "loss": 4.0954, "step": 3029 }, { "epoch": 2.3952569169960474, "grad_norm": 2.3688600063323975, "learning_rate": 2.973286212509415e-07, "loss": 3.7012, "step": 3030 }, { "epoch": 2.3960474308300395, "grad_norm": 2.2394518852233887, "learning_rate": 2.9657945202645284e-07, "loss": 3.8151, "step": 3031 }, { "epoch": 2.3968379446640315, "grad_norm": 2.290454149246216, "learning_rate": 2.958311242644373e-07, "loss": 4.1159, "step": 3032 }, { "epoch": 2.3976284584980236, "grad_norm": 2.1373836994171143, "learning_rate": 2.9508363848814465e-07, "loss": 4.1522, "step": 3033 }, { "epoch": 2.3984189723320157, "grad_norm": 2.132472515106201, "learning_rate": 2.943369952202375e-07, "loss": 4.1368, "step": 3034 }, { "epoch": 2.399209486166008, "grad_norm": 2.216561794281006, "learning_rate": 2.9359119498278646e-07, "loss": 4.0449, "step": 3035 }, { "epoch": 2.4, "grad_norm": 1.9550436735153198, "learning_rate": 2.9284623829727494e-07, "loss": 4.2703, "step": 3036 }, { "epoch": 2.400790513833992, "grad_norm": 2.226912260055542, "learning_rate": 2.921021256845955e-07, "loss": 4.0941, "step": 3037 }, { "epoch": 2.401581027667984, "grad_norm": 2.3615474700927734, "learning_rate": 2.9135885766505085e-07, "loss": 3.9257, "step": 3038 }, { "epoch": 2.402371541501976, "grad_norm": 2.20588755607605, "learning_rate": 2.9061643475835286e-07, "loss": 3.9847, "step": 3039 }, { "epoch": 2.4031620553359683, "grad_norm": 2.1423146724700928, "learning_rate": 2.898748574836232e-07, "loss": 4.0624, "step": 3040 }, { "epoch": 2.4039525691699604, "grad_norm": 2.0428082942962646, "learning_rate": 2.8913412635939034e-07, "loss": 4.0294, "step": 3041 }, { "epoch": 2.4047430830039525, "grad_norm": 1.871712327003479, "learning_rate": 2.8839424190359387e-07, "loss": 4.1746, "step": 3042 }, { "epoch": 2.4055335968379445, "grad_norm": 2.446239471435547, "learning_rate": 2.876552046335799e-07, "loss": 3.652, "step": 3043 }, { "epoch": 2.4063241106719366, "grad_norm": 1.9390578269958496, "learning_rate": 2.869170150661013e-07, "loss": 4.3244, "step": 3044 }, { "epoch": 2.4071146245059287, "grad_norm": 2.3993794918060303, "learning_rate": 2.861796737173196e-07, "loss": 3.7778, "step": 3045 }, { "epoch": 2.407905138339921, "grad_norm": 2.0375001430511475, "learning_rate": 2.8544318110280285e-07, "loss": 3.9215, "step": 3046 }, { "epoch": 2.408695652173913, "grad_norm": 2.656374931335449, "learning_rate": 2.8470753773752527e-07, "loss": 3.5467, "step": 3047 }, { "epoch": 2.409486166007905, "grad_norm": 2.5004167556762695, "learning_rate": 2.8397274413586787e-07, "loss": 4.064, "step": 3048 }, { "epoch": 2.410276679841897, "grad_norm": 2.2749557495117188, "learning_rate": 2.8323880081161694e-07, "loss": 3.6961, "step": 3049 }, { "epoch": 2.411067193675889, "grad_norm": 2.202690839767456, "learning_rate": 2.825057082779645e-07, "loss": 3.9251, "step": 3050 }, { "epoch": 2.4118577075098813, "grad_norm": 2.1747875213623047, "learning_rate": 2.817734670475077e-07, "loss": 4.0221, "step": 3051 }, { "epoch": 2.4126482213438734, "grad_norm": 2.127263307571411, "learning_rate": 2.810420776322487e-07, "loss": 4.0569, "step": 3052 }, { "epoch": 2.4134387351778654, "grad_norm": 2.834138870239258, "learning_rate": 2.8031154054359306e-07, "loss": 4.0877, "step": 3053 }, { "epoch": 2.4142292490118575, "grad_norm": 2.087924003601074, "learning_rate": 2.795818562923507e-07, "loss": 4.0486, "step": 3054 }, { "epoch": 2.4150197628458496, "grad_norm": 2.221693754196167, "learning_rate": 2.7885302538873714e-07, "loss": 4.0105, "step": 3055 }, { "epoch": 2.4158102766798417, "grad_norm": 2.2069010734558105, "learning_rate": 2.781250483423681e-07, "loss": 3.5298, "step": 3056 }, { "epoch": 2.416600790513834, "grad_norm": 2.0568366050720215, "learning_rate": 2.7739792566226433e-07, "loss": 4.2179, "step": 3057 }, { "epoch": 2.417391304347826, "grad_norm": 2.2814128398895264, "learning_rate": 2.766716578568486e-07, "loss": 4.145, "step": 3058 }, { "epoch": 2.418181818181818, "grad_norm": 2.349754810333252, "learning_rate": 2.75946245433946e-07, "loss": 3.6432, "step": 3059 }, { "epoch": 2.4189723320158105, "grad_norm": 2.0276806354522705, "learning_rate": 2.7522168890078384e-07, "loss": 4.0994, "step": 3060 }, { "epoch": 2.419762845849802, "grad_norm": 2.0612144470214844, "learning_rate": 2.7449798876398966e-07, "loss": 4.1753, "step": 3061 }, { "epoch": 2.4205533596837947, "grad_norm": 2.2982242107391357, "learning_rate": 2.7377514552959356e-07, "loss": 3.9749, "step": 3062 }, { "epoch": 2.4213438735177863, "grad_norm": 2.2536916732788086, "learning_rate": 2.730531597030268e-07, "loss": 3.8516, "step": 3063 }, { "epoch": 2.422134387351779, "grad_norm": 1.9575066566467285, "learning_rate": 2.723320317891193e-07, "loss": 4.2995, "step": 3064 }, { "epoch": 2.4229249011857705, "grad_norm": 2.385629415512085, "learning_rate": 2.7161176229210256e-07, "loss": 3.7368, "step": 3065 }, { "epoch": 2.423715415019763, "grad_norm": 2.083620071411133, "learning_rate": 2.7089235171560765e-07, "loss": 4.0991, "step": 3066 }, { "epoch": 2.4245059288537547, "grad_norm": 2.1057701110839844, "learning_rate": 2.701738005626646e-07, "loss": 3.943, "step": 3067 }, { "epoch": 2.4252964426877472, "grad_norm": 2.2508459091186523, "learning_rate": 2.69456109335703e-07, "loss": 3.9364, "step": 3068 }, { "epoch": 2.426086956521739, "grad_norm": 2.0980405807495117, "learning_rate": 2.687392785365513e-07, "loss": 4.2003, "step": 3069 }, { "epoch": 2.4268774703557314, "grad_norm": 1.9972642660140991, "learning_rate": 2.680233086664346e-07, "loss": 4.2626, "step": 3070 }, { "epoch": 2.427667984189723, "grad_norm": 2.2711315155029297, "learning_rate": 2.673082002259788e-07, "loss": 3.8608, "step": 3071 }, { "epoch": 2.4284584980237156, "grad_norm": 2.2515268325805664, "learning_rate": 2.6659395371520577e-07, "loss": 4.0174, "step": 3072 }, { "epoch": 2.4292490118577077, "grad_norm": 2.102890968322754, "learning_rate": 2.658805696335344e-07, "loss": 3.9877, "step": 3073 }, { "epoch": 2.4300395256917, "grad_norm": 2.3459665775299072, "learning_rate": 2.65168048479781e-07, "loss": 3.8612, "step": 3074 }, { "epoch": 2.430830039525692, "grad_norm": 2.159860610961914, "learning_rate": 2.644563907521598e-07, "loss": 3.762, "step": 3075 }, { "epoch": 2.431620553359684, "grad_norm": 2.051974296569824, "learning_rate": 2.6374559694827904e-07, "loss": 4.1662, "step": 3076 }, { "epoch": 2.432411067193676, "grad_norm": 2.062122106552124, "learning_rate": 2.6303566756514426e-07, "loss": 3.9169, "step": 3077 }, { "epoch": 2.433201581027668, "grad_norm": 1.9586265087127686, "learning_rate": 2.6232660309915633e-07, "loss": 4.2703, "step": 3078 }, { "epoch": 2.4339920948616602, "grad_norm": 2.0164132118225098, "learning_rate": 2.616184040461111e-07, "loss": 4.202, "step": 3079 }, { "epoch": 2.4347826086956523, "grad_norm": 2.1918535232543945, "learning_rate": 2.609110709011997e-07, "loss": 3.8275, "step": 3080 }, { "epoch": 2.4355731225296444, "grad_norm": 2.96417498588562, "learning_rate": 2.6020460415900787e-07, "loss": 3.9567, "step": 3081 }, { "epoch": 2.4363636363636365, "grad_norm": 1.9107775688171387, "learning_rate": 2.5949900431351416e-07, "loss": 3.9816, "step": 3082 }, { "epoch": 2.4371541501976286, "grad_norm": 2.1538960933685303, "learning_rate": 2.5879427185809317e-07, "loss": 3.9781, "step": 3083 }, { "epoch": 2.4379446640316207, "grad_norm": 2.0495336055755615, "learning_rate": 2.58090407285512e-07, "loss": 4.3239, "step": 3084 }, { "epoch": 2.438735177865613, "grad_norm": 2.0090699195861816, "learning_rate": 2.573874110879297e-07, "loss": 4.2055, "step": 3085 }, { "epoch": 2.439525691699605, "grad_norm": 2.164705753326416, "learning_rate": 2.566852837568999e-07, "loss": 3.9445, "step": 3086 }, { "epoch": 2.440316205533597, "grad_norm": 1.987374186515808, "learning_rate": 2.55984025783368e-07, "loss": 4.3015, "step": 3087 }, { "epoch": 2.441106719367589, "grad_norm": 2.032668352127075, "learning_rate": 2.552836376576713e-07, "loss": 4.3018, "step": 3088 }, { "epoch": 2.441897233201581, "grad_norm": 2.1808509826660156, "learning_rate": 2.545841198695395e-07, "loss": 3.7895, "step": 3089 }, { "epoch": 2.4426877470355732, "grad_norm": 2.0798017978668213, "learning_rate": 2.5388547290809295e-07, "loss": 4.2466, "step": 3090 }, { "epoch": 2.4434782608695653, "grad_norm": 2.0500648021698, "learning_rate": 2.5318769726184373e-07, "loss": 4.177, "step": 3091 }, { "epoch": 2.4442687747035574, "grad_norm": 2.3244736194610596, "learning_rate": 2.5249079341869487e-07, "loss": 3.7421, "step": 3092 }, { "epoch": 2.4450592885375495, "grad_norm": 1.9574882984161377, "learning_rate": 2.517947618659384e-07, "loss": 4.2179, "step": 3093 }, { "epoch": 2.4458498023715416, "grad_norm": 2.216045379638672, "learning_rate": 2.510996030902581e-07, "loss": 3.7537, "step": 3094 }, { "epoch": 2.4466403162055337, "grad_norm": 2.321498155593872, "learning_rate": 2.5040531757772653e-07, "loss": 3.8809, "step": 3095 }, { "epoch": 2.4474308300395258, "grad_norm": 2.2017929553985596, "learning_rate": 2.4971190581380613e-07, "loss": 4.0612, "step": 3096 }, { "epoch": 2.448221343873518, "grad_norm": 4.793033123016357, "learning_rate": 2.4901936828334785e-07, "loss": 3.709, "step": 3097 }, { "epoch": 2.44901185770751, "grad_norm": 2.385693311691284, "learning_rate": 2.48327705470592e-07, "loss": 4.0844, "step": 3098 }, { "epoch": 2.449802371541502, "grad_norm": 2.0894124507904053, "learning_rate": 2.4763691785916675e-07, "loss": 4.1972, "step": 3099 }, { "epoch": 2.450592885375494, "grad_norm": 2.1643283367156982, "learning_rate": 2.469470059320885e-07, "loss": 3.9449, "step": 3100 }, { "epoch": 2.4513833992094862, "grad_norm": 2.2001187801361084, "learning_rate": 2.4625797017176195e-07, "loss": 3.6582, "step": 3101 }, { "epoch": 2.4521739130434783, "grad_norm": 2.130486488342285, "learning_rate": 2.4556981105997713e-07, "loss": 4.1682, "step": 3102 }, { "epoch": 2.4529644268774704, "grad_norm": 2.067145347595215, "learning_rate": 2.4488252907791366e-07, "loss": 4.0987, "step": 3103 }, { "epoch": 2.4537549407114625, "grad_norm": 2.1043105125427246, "learning_rate": 2.441961247061368e-07, "loss": 4.2116, "step": 3104 }, { "epoch": 2.4545454545454546, "grad_norm": 2.0998663902282715, "learning_rate": 2.4351059842459743e-07, "loss": 4.1286, "step": 3105 }, { "epoch": 2.4553359683794467, "grad_norm": 2.1561925411224365, "learning_rate": 2.4282595071263337e-07, "loss": 3.9258, "step": 3106 }, { "epoch": 2.4561264822134388, "grad_norm": 2.0974924564361572, "learning_rate": 2.4214218204896777e-07, "loss": 4.0685, "step": 3107 }, { "epoch": 2.456916996047431, "grad_norm": 2.2154741287231445, "learning_rate": 2.414592929117092e-07, "loss": 3.8304, "step": 3108 }, { "epoch": 2.457707509881423, "grad_norm": 2.0132498741149902, "learning_rate": 2.4077728377835123e-07, "loss": 4.0129, "step": 3109 }, { "epoch": 2.458498023715415, "grad_norm": 2.520530939102173, "learning_rate": 2.4009615512577223e-07, "loss": 3.9286, "step": 3110 }, { "epoch": 2.459288537549407, "grad_norm": 2.3373162746429443, "learning_rate": 2.3941590743023486e-07, "loss": 3.6783, "step": 3111 }, { "epoch": 2.460079051383399, "grad_norm": 2.64737868309021, "learning_rate": 2.387365411673855e-07, "loss": 3.5653, "step": 3112 }, { "epoch": 2.4608695652173913, "grad_norm": 1.9240339994430542, "learning_rate": 2.380580568122548e-07, "loss": 4.3832, "step": 3113 }, { "epoch": 2.4616600790513834, "grad_norm": 2.1175951957702637, "learning_rate": 2.3738045483925597e-07, "loss": 3.9848, "step": 3114 }, { "epoch": 2.4624505928853755, "grad_norm": 1.9482421875, "learning_rate": 2.3670373572218534e-07, "loss": 4.3504, "step": 3115 }, { "epoch": 2.4632411067193676, "grad_norm": 1.9997698068618774, "learning_rate": 2.3602789993422359e-07, "loss": 4.094, "step": 3116 }, { "epoch": 2.4640316205533597, "grad_norm": 2.2315850257873535, "learning_rate": 2.3535294794793116e-07, "loss": 3.8729, "step": 3117 }, { "epoch": 2.4648221343873518, "grad_norm": 2.1245148181915283, "learning_rate": 2.3467888023525225e-07, "loss": 4.0709, "step": 3118 }, { "epoch": 2.465612648221344, "grad_norm": 2.490997314453125, "learning_rate": 2.340056972675123e-07, "loss": 3.8207, "step": 3119 }, { "epoch": 2.466403162055336, "grad_norm": 2.2958500385284424, "learning_rate": 2.3333339951541783e-07, "loss": 3.9926, "step": 3120 }, { "epoch": 2.467193675889328, "grad_norm": 2.4553287029266357, "learning_rate": 2.3266198744905724e-07, "loss": 3.5275, "step": 3121 }, { "epoch": 2.46798418972332, "grad_norm": 2.1264772415161133, "learning_rate": 2.319914615378984e-07, "loss": 4.09, "step": 3122 }, { "epoch": 2.468774703557312, "grad_norm": 2.2540273666381836, "learning_rate": 2.3132182225078991e-07, "loss": 3.9342, "step": 3123 }, { "epoch": 2.4695652173913043, "grad_norm": 2.154510736465454, "learning_rate": 2.3065307005596205e-07, "loss": 3.7632, "step": 3124 }, { "epoch": 2.4703557312252964, "grad_norm": 2.429630994796753, "learning_rate": 2.2998520542102226e-07, "loss": 3.6183, "step": 3125 }, { "epoch": 2.4711462450592885, "grad_norm": 2.0377211570739746, "learning_rate": 2.29318228812959e-07, "loss": 4.2509, "step": 3126 }, { "epoch": 2.4719367588932806, "grad_norm": 3.4509265422821045, "learning_rate": 2.2865214069813938e-07, "loss": 4.0329, "step": 3127 }, { "epoch": 2.4727272727272727, "grad_norm": 2.3339133262634277, "learning_rate": 2.2798694154230936e-07, "loss": 4.1577, "step": 3128 }, { "epoch": 2.4735177865612648, "grad_norm": 2.0677411556243896, "learning_rate": 2.2732263181059305e-07, "loss": 4.0025, "step": 3129 }, { "epoch": 2.474308300395257, "grad_norm": 2.045476198196411, "learning_rate": 2.2665921196749333e-07, "loss": 4.1028, "step": 3130 }, { "epoch": 2.475098814229249, "grad_norm": 2.00045108795166, "learning_rate": 2.2599668247688936e-07, "loss": 3.9701, "step": 3131 }, { "epoch": 2.475889328063241, "grad_norm": 2.151320457458496, "learning_rate": 2.2533504380203972e-07, "loss": 3.8484, "step": 3132 }, { "epoch": 2.476679841897233, "grad_norm": 2.2575740814208984, "learning_rate": 2.2467429640557902e-07, "loss": 3.3374, "step": 3133 }, { "epoch": 2.477470355731225, "grad_norm": 2.0503833293914795, "learning_rate": 2.2401444074951816e-07, "loss": 4.1766, "step": 3134 }, { "epoch": 2.4782608695652173, "grad_norm": 2.317464590072632, "learning_rate": 2.2335547729524507e-07, "loss": 3.8879, "step": 3135 }, { "epoch": 2.4790513833992094, "grad_norm": 2.2077839374542236, "learning_rate": 2.226974065035249e-07, "loss": 3.9421, "step": 3136 }, { "epoch": 2.4798418972332015, "grad_norm": 2.159358024597168, "learning_rate": 2.2204022883449665e-07, "loss": 3.7089, "step": 3137 }, { "epoch": 2.4806324110671936, "grad_norm": 2.030165910720825, "learning_rate": 2.2138394474767614e-07, "loss": 4.2626, "step": 3138 }, { "epoch": 2.4814229249011857, "grad_norm": 1.9967448711395264, "learning_rate": 2.207285547019538e-07, "loss": 4.2187, "step": 3139 }, { "epoch": 2.4822134387351777, "grad_norm": 2.2570159435272217, "learning_rate": 2.2007405915559552e-07, "loss": 4.2146, "step": 3140 }, { "epoch": 2.48300395256917, "grad_norm": 2.2375917434692383, "learning_rate": 2.1942045856624093e-07, "loss": 4.1379, "step": 3141 }, { "epoch": 2.483794466403162, "grad_norm": 1.9549180269241333, "learning_rate": 2.1876775339090493e-07, "loss": 4.2118, "step": 3142 }, { "epoch": 2.484584980237154, "grad_norm": 2.217287063598633, "learning_rate": 2.1811594408597464e-07, "loss": 4.1464, "step": 3143 }, { "epoch": 2.485375494071146, "grad_norm": 2.2741847038269043, "learning_rate": 2.1746503110721262e-07, "loss": 3.7321, "step": 3144 }, { "epoch": 2.486166007905138, "grad_norm": 2.3511312007904053, "learning_rate": 2.168150149097543e-07, "loss": 3.7751, "step": 3145 }, { "epoch": 2.4869565217391303, "grad_norm": 2.092973470687866, "learning_rate": 2.1616589594810647e-07, "loss": 4.0622, "step": 3146 }, { "epoch": 2.4877470355731224, "grad_norm": 1.935765266418457, "learning_rate": 2.1551767467615058e-07, "loss": 4.2293, "step": 3147 }, { "epoch": 2.4885375494071145, "grad_norm": 2.0693187713623047, "learning_rate": 2.1487035154713918e-07, "loss": 4.0523, "step": 3148 }, { "epoch": 2.4893280632411066, "grad_norm": 2.251652956008911, "learning_rate": 2.1422392701369742e-07, "loss": 4.0174, "step": 3149 }, { "epoch": 2.4901185770750986, "grad_norm": 2.12834095954895, "learning_rate": 2.135784015278222e-07, "loss": 3.9689, "step": 3150 }, { "epoch": 2.4909090909090907, "grad_norm": 2.1804745197296143, "learning_rate": 2.1293377554088016e-07, "loss": 3.8289, "step": 3151 }, { "epoch": 2.491699604743083, "grad_norm": 2.194683074951172, "learning_rate": 2.1229004950361175e-07, "loss": 3.7987, "step": 3152 }, { "epoch": 2.492490118577075, "grad_norm": 2.1722021102905273, "learning_rate": 2.116472238661264e-07, "loss": 3.7328, "step": 3153 }, { "epoch": 2.493280632411067, "grad_norm": 2.114225387573242, "learning_rate": 2.110052990779037e-07, "loss": 3.9733, "step": 3154 }, { "epoch": 2.494071146245059, "grad_norm": 2.079434871673584, "learning_rate": 2.1036427558779437e-07, "loss": 3.9139, "step": 3155 }, { "epoch": 2.494861660079051, "grad_norm": 2.0029075145721436, "learning_rate": 2.097241538440181e-07, "loss": 4.235, "step": 3156 }, { "epoch": 2.4956521739130437, "grad_norm": 2.0823943614959717, "learning_rate": 2.0908493429416474e-07, "loss": 4.2033, "step": 3157 }, { "epoch": 2.4964426877470354, "grad_norm": 2.1790051460266113, "learning_rate": 2.0844661738519295e-07, "loss": 3.9293, "step": 3158 }, { "epoch": 2.497233201581028, "grad_norm": 2.359469413757324, "learning_rate": 2.0780920356343014e-07, "loss": 3.8003, "step": 3159 }, { "epoch": 2.4980237154150196, "grad_norm": 2.0325546264648438, "learning_rate": 2.0717269327457234e-07, "loss": 4.078, "step": 3160 }, { "epoch": 2.498814229249012, "grad_norm": 2.1118600368499756, "learning_rate": 2.06537086963684e-07, "loss": 4.3717, "step": 3161 }, { "epoch": 2.4996047430830037, "grad_norm": 2.113739490509033, "learning_rate": 2.0590238507519747e-07, "loss": 4.225, "step": 3162 }, { "epoch": 2.5003952569169963, "grad_norm": 2.169532537460327, "learning_rate": 2.052685880529121e-07, "loss": 3.9342, "step": 3163 }, { "epoch": 2.501185770750988, "grad_norm": 2.1923017501831055, "learning_rate": 2.0463569633999502e-07, "loss": 4.235, "step": 3164 }, { "epoch": 2.5019762845849804, "grad_norm": 2.0630104541778564, "learning_rate": 2.0400371037898108e-07, "loss": 4.0157, "step": 3165 }, { "epoch": 2.502766798418972, "grad_norm": 1.9832229614257812, "learning_rate": 2.033726306117703e-07, "loss": 4.2104, "step": 3166 }, { "epoch": 2.5035573122529646, "grad_norm": 2.350796699523926, "learning_rate": 2.0274245747963004e-07, "loss": 3.8475, "step": 3167 }, { "epoch": 2.5043478260869563, "grad_norm": 2.2047901153564453, "learning_rate": 2.021131914231935e-07, "loss": 3.8702, "step": 3168 }, { "epoch": 2.505138339920949, "grad_norm": 2.016871213912964, "learning_rate": 2.0148483288245974e-07, "loss": 4.4606, "step": 3169 }, { "epoch": 2.5059288537549405, "grad_norm": 2.042497396469116, "learning_rate": 2.008573822967929e-07, "loss": 3.9323, "step": 3170 }, { "epoch": 2.506719367588933, "grad_norm": 2.321272373199463, "learning_rate": 2.0023084010492276e-07, "loss": 3.7236, "step": 3171 }, { "epoch": 2.5075098814229246, "grad_norm": 2.6632168292999268, "learning_rate": 1.9960520674494353e-07, "loss": 3.8941, "step": 3172 }, { "epoch": 2.508300395256917, "grad_norm": 2.095221996307373, "learning_rate": 1.9898048265431413e-07, "loss": 4.1988, "step": 3173 }, { "epoch": 2.509090909090909, "grad_norm": 2.301815986633301, "learning_rate": 1.9835666826985794e-07, "loss": 4.0134, "step": 3174 }, { "epoch": 2.5098814229249014, "grad_norm": 2.0538153648376465, "learning_rate": 1.9773376402776138e-07, "loss": 4.0888, "step": 3175 }, { "epoch": 2.5106719367588934, "grad_norm": 2.3543028831481934, "learning_rate": 1.9711177036357493e-07, "loss": 4.148, "step": 3176 }, { "epoch": 2.5114624505928855, "grad_norm": 2.028562307357788, "learning_rate": 1.9649068771221345e-07, "loss": 4.3496, "step": 3177 }, { "epoch": 2.5122529644268776, "grad_norm": 2.243901491165161, "learning_rate": 1.9587051650795312e-07, "loss": 4.0077, "step": 3178 }, { "epoch": 2.5130434782608697, "grad_norm": 2.343412160873413, "learning_rate": 1.9525125718443355e-07, "loss": 3.9947, "step": 3179 }, { "epoch": 2.513833992094862, "grad_norm": 2.405358076095581, "learning_rate": 1.9463291017465685e-07, "loss": 4.0409, "step": 3180 }, { "epoch": 2.514624505928854, "grad_norm": 2.1914734840393066, "learning_rate": 1.9401547591098717e-07, "loss": 4.0546, "step": 3181 }, { "epoch": 2.515415019762846, "grad_norm": 2.3689401149749756, "learning_rate": 1.933989548251507e-07, "loss": 3.5411, "step": 3182 }, { "epoch": 2.516205533596838, "grad_norm": 1.927311897277832, "learning_rate": 1.9278334734823388e-07, "loss": 4.4171, "step": 3183 }, { "epoch": 2.51699604743083, "grad_norm": 2.25848650932312, "learning_rate": 1.9216865391068545e-07, "loss": 3.8002, "step": 3184 }, { "epoch": 2.5177865612648223, "grad_norm": 2.262028455734253, "learning_rate": 1.9155487494231573e-07, "loss": 3.824, "step": 3185 }, { "epoch": 2.5185770750988143, "grad_norm": 2.171861171722412, "learning_rate": 1.9094201087229356e-07, "loss": 3.8736, "step": 3186 }, { "epoch": 2.5193675889328064, "grad_norm": 2.131624937057495, "learning_rate": 1.903300621291499e-07, "loss": 4.055, "step": 3187 }, { "epoch": 2.5201581027667985, "grad_norm": 2.1333742141723633, "learning_rate": 1.8971902914077455e-07, "loss": 3.9398, "step": 3188 }, { "epoch": 2.5209486166007906, "grad_norm": 2.1024250984191895, "learning_rate": 1.891089123344178e-07, "loss": 3.9785, "step": 3189 }, { "epoch": 2.5217391304347827, "grad_norm": 2.176284074783325, "learning_rate": 1.884997121366887e-07, "loss": 4.0063, "step": 3190 }, { "epoch": 2.522529644268775, "grad_norm": 2.2004356384277344, "learning_rate": 1.8789142897355605e-07, "loss": 3.969, "step": 3191 }, { "epoch": 2.523320158102767, "grad_norm": 2.564088821411133, "learning_rate": 1.872840632703461e-07, "loss": 3.7397, "step": 3192 }, { "epoch": 2.524110671936759, "grad_norm": 2.4307358264923096, "learning_rate": 1.8667761545174528e-07, "loss": 3.7413, "step": 3193 }, { "epoch": 2.524901185770751, "grad_norm": 2.445683240890503, "learning_rate": 1.8607208594179777e-07, "loss": 3.781, "step": 3194 }, { "epoch": 2.525691699604743, "grad_norm": 2.328058958053589, "learning_rate": 1.8546747516390432e-07, "loss": 3.844, "step": 3195 }, { "epoch": 2.5264822134387352, "grad_norm": 2.123363494873047, "learning_rate": 1.8486378354082452e-07, "loss": 4.0188, "step": 3196 }, { "epoch": 2.5272727272727273, "grad_norm": 2.176330089569092, "learning_rate": 1.842610114946758e-07, "loss": 3.9809, "step": 3197 }, { "epoch": 2.5280632411067194, "grad_norm": 2.1355557441711426, "learning_rate": 1.8365915944693102e-07, "loss": 3.9346, "step": 3198 }, { "epoch": 2.5288537549407115, "grad_norm": 2.056623697280884, "learning_rate": 1.8305822781842073e-07, "loss": 4.0205, "step": 3199 }, { "epoch": 2.5296442687747036, "grad_norm": 2.2976419925689697, "learning_rate": 1.8245821702933185e-07, "loss": 3.7272, "step": 3200 }, { "epoch": 2.5304347826086957, "grad_norm": 2.0103373527526855, "learning_rate": 1.8185912749920707e-07, "loss": 4.2704, "step": 3201 }, { "epoch": 2.531225296442688, "grad_norm": 2.079150676727295, "learning_rate": 1.812609596469455e-07, "loss": 3.8874, "step": 3202 }, { "epoch": 2.53201581027668, "grad_norm": 2.049501895904541, "learning_rate": 1.806637138908014e-07, "loss": 4.3221, "step": 3203 }, { "epoch": 2.532806324110672, "grad_norm": 2.084683656692505, "learning_rate": 1.800673906483835e-07, "loss": 4.0538, "step": 3204 }, { "epoch": 2.533596837944664, "grad_norm": 2.3255069255828857, "learning_rate": 1.7947199033665735e-07, "loss": 4.2119, "step": 3205 }, { "epoch": 2.534387351778656, "grad_norm": 2.138888359069824, "learning_rate": 1.7887751337194186e-07, "loss": 4.0529, "step": 3206 }, { "epoch": 2.5351778656126482, "grad_norm": 1.96915602684021, "learning_rate": 1.7828396016990988e-07, "loss": 4.0121, "step": 3207 }, { "epoch": 2.5359683794466403, "grad_norm": 2.403768539428711, "learning_rate": 1.7769133114558962e-07, "loss": 3.9032, "step": 3208 }, { "epoch": 2.5367588932806324, "grad_norm": 2.188307523727417, "learning_rate": 1.7709962671336237e-07, "loss": 3.8866, "step": 3209 }, { "epoch": 2.5375494071146245, "grad_norm": 2.158127546310425, "learning_rate": 1.7650884728696291e-07, "loss": 4.1662, "step": 3210 }, { "epoch": 2.5383399209486166, "grad_norm": 2.4125447273254395, "learning_rate": 1.759189932794798e-07, "loss": 3.4012, "step": 3211 }, { "epoch": 2.5391304347826087, "grad_norm": 2.0766751766204834, "learning_rate": 1.753300651033531e-07, "loss": 3.8887, "step": 3212 }, { "epoch": 2.539920948616601, "grad_norm": 2.0470476150512695, "learning_rate": 1.7474206317037722e-07, "loss": 3.9321, "step": 3213 }, { "epoch": 2.540711462450593, "grad_norm": 2.1649270057678223, "learning_rate": 1.7415498789169847e-07, "loss": 4.038, "step": 3214 }, { "epoch": 2.541501976284585, "grad_norm": 2.3775672912597656, "learning_rate": 1.7356883967781417e-07, "loss": 3.7194, "step": 3215 }, { "epoch": 2.542292490118577, "grad_norm": 2.324707269668579, "learning_rate": 1.7298361893857444e-07, "loss": 3.7761, "step": 3216 }, { "epoch": 2.543083003952569, "grad_norm": 4.60009241104126, "learning_rate": 1.7239932608318065e-07, "loss": 4.0082, "step": 3217 }, { "epoch": 2.5438735177865612, "grad_norm": 2.391746759414673, "learning_rate": 1.718159615201853e-07, "loss": 3.6504, "step": 3218 }, { "epoch": 2.5446640316205533, "grad_norm": 2.1501424312591553, "learning_rate": 1.7123352565749178e-07, "loss": 4.0163, "step": 3219 }, { "epoch": 2.5454545454545454, "grad_norm": 2.210034132003784, "learning_rate": 1.7065201890235433e-07, "loss": 3.9118, "step": 3220 }, { "epoch": 2.5462450592885375, "grad_norm": 2.0437135696411133, "learning_rate": 1.7007144166137733e-07, "loss": 4.139, "step": 3221 }, { "epoch": 2.5470355731225296, "grad_norm": 3.8725552558898926, "learning_rate": 1.6949179434051525e-07, "loss": 3.6565, "step": 3222 }, { "epoch": 2.5478260869565217, "grad_norm": 2.417370319366455, "learning_rate": 1.6891307734507255e-07, "loss": 3.6416, "step": 3223 }, { "epoch": 2.5486166007905138, "grad_norm": 2.1862893104553223, "learning_rate": 1.6833529107970236e-07, "loss": 4.1168, "step": 3224 }, { "epoch": 2.549407114624506, "grad_norm": 2.27724027633667, "learning_rate": 1.6775843594840784e-07, "loss": 3.7889, "step": 3225 }, { "epoch": 2.550197628458498, "grad_norm": 2.2084622383117676, "learning_rate": 1.6718251235454168e-07, "loss": 3.5403, "step": 3226 }, { "epoch": 2.55098814229249, "grad_norm": 2.2026922702789307, "learning_rate": 1.6660752070080326e-07, "loss": 3.7572, "step": 3227 }, { "epoch": 2.551778656126482, "grad_norm": 2.006786584854126, "learning_rate": 1.6603346138924213e-07, "loss": 4.3641, "step": 3228 }, { "epoch": 2.5525691699604742, "grad_norm": 2.1643147468566895, "learning_rate": 1.6546033482125521e-07, "loss": 3.9609, "step": 3229 }, { "epoch": 2.5533596837944663, "grad_norm": 2.660752058029175, "learning_rate": 1.6488814139758712e-07, "loss": 3.7816, "step": 3230 }, { "epoch": 2.5541501976284584, "grad_norm": 2.310178756713867, "learning_rate": 1.6431688151833013e-07, "loss": 3.6921, "step": 3231 }, { "epoch": 2.5549407114624505, "grad_norm": 2.2747254371643066, "learning_rate": 1.6374655558292439e-07, "loss": 4.039, "step": 3232 }, { "epoch": 2.5557312252964426, "grad_norm": 2.1470658779144287, "learning_rate": 1.631771639901551e-07, "loss": 3.8945, "step": 3233 }, { "epoch": 2.5565217391304347, "grad_norm": 2.33600115776062, "learning_rate": 1.6260870713815657e-07, "loss": 3.8832, "step": 3234 }, { "epoch": 2.5573122529644268, "grad_norm": 2.1579999923706055, "learning_rate": 1.6204118542440856e-07, "loss": 4.0444, "step": 3235 }, { "epoch": 2.558102766798419, "grad_norm": 2.5540242195129395, "learning_rate": 1.6147459924573576e-07, "loss": 4.0633, "step": 3236 }, { "epoch": 2.558893280632411, "grad_norm": 2.2458713054656982, "learning_rate": 1.6090894899831014e-07, "loss": 4.2368, "step": 3237 }, { "epoch": 2.559683794466403, "grad_norm": 2.258563280105591, "learning_rate": 1.603442350776495e-07, "loss": 3.9427, "step": 3238 }, { "epoch": 2.560474308300395, "grad_norm": 2.069934368133545, "learning_rate": 1.5978045787861545e-07, "loss": 4.3672, "step": 3239 }, { "epoch": 2.561264822134387, "grad_norm": 2.1225597858428955, "learning_rate": 1.5921761779541566e-07, "loss": 4.1057, "step": 3240 }, { "epoch": 2.5620553359683793, "grad_norm": 2.2829549312591553, "learning_rate": 1.5865571522160265e-07, "loss": 3.754, "step": 3241 }, { "epoch": 2.5628458498023714, "grad_norm": 2.151785135269165, "learning_rate": 1.5809475055007265e-07, "loss": 4.0885, "step": 3242 }, { "epoch": 2.5636363636363635, "grad_norm": 2.2758569717407227, "learning_rate": 1.5753472417306725e-07, "loss": 3.3272, "step": 3243 }, { "epoch": 2.5644268774703556, "grad_norm": 2.281559944152832, "learning_rate": 1.5697563648217035e-07, "loss": 3.5004, "step": 3244 }, { "epoch": 2.5652173913043477, "grad_norm": 2.2179551124572754, "learning_rate": 1.564174878683104e-07, "loss": 3.7406, "step": 3245 }, { "epoch": 2.5660079051383398, "grad_norm": 2.280735731124878, "learning_rate": 1.5586027872176024e-07, "loss": 4.0256, "step": 3246 }, { "epoch": 2.566798418972332, "grad_norm": 2.195708751678467, "learning_rate": 1.5530400943213385e-07, "loss": 4.0327, "step": 3247 }, { "epoch": 2.5675889328063244, "grad_norm": 2.0624046325683594, "learning_rate": 1.5474868038838928e-07, "loss": 3.8574, "step": 3248 }, { "epoch": 2.568379446640316, "grad_norm": 2.015899658203125, "learning_rate": 1.5419429197882662e-07, "loss": 4.3255, "step": 3249 }, { "epoch": 2.5691699604743086, "grad_norm": 2.1521859169006348, "learning_rate": 1.5364084459108874e-07, "loss": 3.9251, "step": 3250 }, { "epoch": 2.5699604743083, "grad_norm": 2.012197971343994, "learning_rate": 1.530883386121601e-07, "loss": 4.2519, "step": 3251 }, { "epoch": 2.5707509881422927, "grad_norm": 2.249691963195801, "learning_rate": 1.5253677442836734e-07, "loss": 3.6845, "step": 3252 }, { "epoch": 2.5715415019762844, "grad_norm": 1.9893183708190918, "learning_rate": 1.519861524253773e-07, "loss": 4.2585, "step": 3253 }, { "epoch": 2.572332015810277, "grad_norm": 2.082404136657715, "learning_rate": 1.514364729881998e-07, "loss": 4.1616, "step": 3254 }, { "epoch": 2.5731225296442686, "grad_norm": 2.026745557785034, "learning_rate": 1.508877365011851e-07, "loss": 4.1479, "step": 3255 }, { "epoch": 2.573913043478261, "grad_norm": 2.1400108337402344, "learning_rate": 1.503399433480231e-07, "loss": 4.1874, "step": 3256 }, { "epoch": 2.5747035573122528, "grad_norm": 2.089928388595581, "learning_rate": 1.4979309391174456e-07, "loss": 4.1134, "step": 3257 }, { "epoch": 2.5754940711462453, "grad_norm": 2.1032488346099854, "learning_rate": 1.4924718857472181e-07, "loss": 4.0145, "step": 3258 }, { "epoch": 2.576284584980237, "grad_norm": 2.7864811420440674, "learning_rate": 1.4870222771866482e-07, "loss": 3.8069, "step": 3259 }, { "epoch": 2.5770750988142295, "grad_norm": 2.1247146129608154, "learning_rate": 1.4815821172462445e-07, "loss": 4.1255, "step": 3260 }, { "epoch": 2.577865612648221, "grad_norm": 2.140847682952881, "learning_rate": 1.476151409729906e-07, "loss": 3.9752, "step": 3261 }, { "epoch": 2.5786561264822137, "grad_norm": 2.89192271232605, "learning_rate": 1.4707301584349236e-07, "loss": 3.8279, "step": 3262 }, { "epoch": 2.5794466403162053, "grad_norm": 2.07906436920166, "learning_rate": 1.4653183671519748e-07, "loss": 4.0433, "step": 3263 }, { "epoch": 2.580237154150198, "grad_norm": 2.1485490798950195, "learning_rate": 1.4599160396651263e-07, "loss": 4.0969, "step": 3264 }, { "epoch": 2.5810276679841895, "grad_norm": 2.157965660095215, "learning_rate": 1.4545231797518126e-07, "loss": 3.8023, "step": 3265 }, { "epoch": 2.581818181818182, "grad_norm": 2.012155532836914, "learning_rate": 1.4491397911828696e-07, "loss": 4.0862, "step": 3266 }, { "epoch": 2.5826086956521737, "grad_norm": 2.147394895553589, "learning_rate": 1.4437658777225027e-07, "loss": 3.5412, "step": 3267 }, { "epoch": 2.583399209486166, "grad_norm": 2.120366096496582, "learning_rate": 1.438401443128281e-07, "loss": 4.0068, "step": 3268 }, { "epoch": 2.584189723320158, "grad_norm": 2.083758592605591, "learning_rate": 1.4330464911511577e-07, "loss": 3.8882, "step": 3269 }, { "epoch": 2.5849802371541504, "grad_norm": 2.156118631362915, "learning_rate": 1.427701025535454e-07, "loss": 4.0936, "step": 3270 }, { "epoch": 2.585770750988142, "grad_norm": 2.086678981781006, "learning_rate": 1.422365050018855e-07, "loss": 4.2569, "step": 3271 }, { "epoch": 2.5865612648221346, "grad_norm": 2.02467679977417, "learning_rate": 1.417038568332415e-07, "loss": 4.1018, "step": 3272 }, { "epoch": 2.5873517786561266, "grad_norm": 2.101095676422119, "learning_rate": 1.4117215842005376e-07, "loss": 4.0754, "step": 3273 }, { "epoch": 2.5881422924901187, "grad_norm": 2.0789690017700195, "learning_rate": 1.4064141013410043e-07, "loss": 4.0476, "step": 3274 }, { "epoch": 2.588932806324111, "grad_norm": 2.1072561740875244, "learning_rate": 1.401116123464941e-07, "loss": 4.0283, "step": 3275 }, { "epoch": 2.589723320158103, "grad_norm": 2.07308030128479, "learning_rate": 1.395827654276825e-07, "loss": 4.1898, "step": 3276 }, { "epoch": 2.590513833992095, "grad_norm": 2.060724973678589, "learning_rate": 1.3905486974744913e-07, "loss": 4.0471, "step": 3277 }, { "epoch": 2.591304347826087, "grad_norm": 2.2855546474456787, "learning_rate": 1.3852792567491223e-07, "loss": 3.8787, "step": 3278 }, { "epoch": 2.592094861660079, "grad_norm": 2.1855080127716064, "learning_rate": 1.3800193357852458e-07, "loss": 4.0029, "step": 3279 }, { "epoch": 2.5928853754940713, "grad_norm": 2.1281347274780273, "learning_rate": 1.3747689382607348e-07, "loss": 3.9895, "step": 3280 }, { "epoch": 2.5936758893280634, "grad_norm": 2.197705030441284, "learning_rate": 1.3695280678467992e-07, "loss": 3.9916, "step": 3281 }, { "epoch": 2.5944664031620555, "grad_norm": 2.137338161468506, "learning_rate": 1.364296728207991e-07, "loss": 3.8077, "step": 3282 }, { "epoch": 2.5952569169960475, "grad_norm": 2.1228413581848145, "learning_rate": 1.3590749230021975e-07, "loss": 4.0937, "step": 3283 }, { "epoch": 2.5960474308300396, "grad_norm": 2.1605641841888428, "learning_rate": 1.3538626558806417e-07, "loss": 4.0048, "step": 3284 }, { "epoch": 2.5968379446640317, "grad_norm": 2.148310899734497, "learning_rate": 1.348659930487866e-07, "loss": 4.0792, "step": 3285 }, { "epoch": 2.597628458498024, "grad_norm": 2.2601075172424316, "learning_rate": 1.3434667504617525e-07, "loss": 3.7154, "step": 3286 }, { "epoch": 2.598418972332016, "grad_norm": 1.9780309200286865, "learning_rate": 1.3382831194335114e-07, "loss": 4.1631, "step": 3287 }, { "epoch": 2.599209486166008, "grad_norm": 2.024808168411255, "learning_rate": 1.3331090410276653e-07, "loss": 4.3564, "step": 3288 }, { "epoch": 2.6, "grad_norm": 2.0479533672332764, "learning_rate": 1.3279445188620614e-07, "loss": 4.3405, "step": 3289 }, { "epoch": 2.600790513833992, "grad_norm": 2.190915822982788, "learning_rate": 1.3227895565478681e-07, "loss": 4.1205, "step": 3290 }, { "epoch": 2.6015810276679843, "grad_norm": 2.059741497039795, "learning_rate": 1.3176441576895653e-07, "loss": 4.3225, "step": 3291 }, { "epoch": 2.6023715415019764, "grad_norm": 2.0917840003967285, "learning_rate": 1.3125083258849503e-07, "loss": 3.9573, "step": 3292 }, { "epoch": 2.6031620553359685, "grad_norm": 2.1105539798736572, "learning_rate": 1.30738206472513e-07, "loss": 4.0087, "step": 3293 }, { "epoch": 2.6039525691699605, "grad_norm": 2.1082589626312256, "learning_rate": 1.3022653777945092e-07, "loss": 4.2365, "step": 3294 }, { "epoch": 2.6047430830039526, "grad_norm": 2.2463440895080566, "learning_rate": 1.297158268670813e-07, "loss": 3.9694, "step": 3295 }, { "epoch": 2.6055335968379447, "grad_norm": 2.177652597427368, "learning_rate": 1.2920607409250656e-07, "loss": 3.8806, "step": 3296 }, { "epoch": 2.606324110671937, "grad_norm": 2.257054090499878, "learning_rate": 1.2869727981215818e-07, "loss": 3.687, "step": 3297 }, { "epoch": 2.607114624505929, "grad_norm": 1.951479434967041, "learning_rate": 1.281894443817982e-07, "loss": 4.0688, "step": 3298 }, { "epoch": 2.607905138339921, "grad_norm": 2.517857789993286, "learning_rate": 1.2768256815651892e-07, "loss": 3.9149, "step": 3299 }, { "epoch": 2.608695652173913, "grad_norm": 2.3040452003479004, "learning_rate": 1.2717665149074048e-07, "loss": 3.5268, "step": 3300 }, { "epoch": 2.609486166007905, "grad_norm": 2.068089246749878, "learning_rate": 1.2667169473821326e-07, "loss": 3.9219, "step": 3301 }, { "epoch": 2.6102766798418973, "grad_norm": 2.0403122901916504, "learning_rate": 1.2616769825201502e-07, "loss": 4.1658, "step": 3302 }, { "epoch": 2.6110671936758894, "grad_norm": 2.30795955657959, "learning_rate": 1.2566466238455415e-07, "loss": 3.9782, "step": 3303 }, { "epoch": 2.6118577075098814, "grad_norm": 2.1656546592712402, "learning_rate": 1.2516258748756583e-07, "loss": 3.9415, "step": 3304 }, { "epoch": 2.6126482213438735, "grad_norm": 2.111265182495117, "learning_rate": 1.246614739121135e-07, "loss": 3.8564, "step": 3305 }, { "epoch": 2.6134387351778656, "grad_norm": 2.2290780544281006, "learning_rate": 1.2416132200858833e-07, "loss": 3.9052, "step": 3306 }, { "epoch": 2.6142292490118577, "grad_norm": 2.422417163848877, "learning_rate": 1.236621321267103e-07, "loss": 3.6974, "step": 3307 }, { "epoch": 2.61501976284585, "grad_norm": 1.984999656677246, "learning_rate": 1.231639046155249e-07, "loss": 4.0471, "step": 3308 }, { "epoch": 2.615810276679842, "grad_norm": 1.9213417768478394, "learning_rate": 1.2266663982340582e-07, "loss": 4.2172, "step": 3309 }, { "epoch": 2.616600790513834, "grad_norm": 2.0755867958068848, "learning_rate": 1.221703380980534e-07, "loss": 4.0979, "step": 3310 }, { "epoch": 2.617391304347826, "grad_norm": 2.0905864238739014, "learning_rate": 1.2167499978649456e-07, "loss": 4.045, "step": 3311 }, { "epoch": 2.618181818181818, "grad_norm": 2.426140546798706, "learning_rate": 1.211806252350824e-07, "loss": 3.8073, "step": 3312 }, { "epoch": 2.6189723320158103, "grad_norm": 2.091276168823242, "learning_rate": 1.2068721478949658e-07, "loss": 4.0831, "step": 3313 }, { "epoch": 2.6197628458498023, "grad_norm": 2.1138296127319336, "learning_rate": 1.2019476879474145e-07, "loss": 3.8596, "step": 3314 }, { "epoch": 2.6205533596837944, "grad_norm": 2.1411848068237305, "learning_rate": 1.197032875951489e-07, "loss": 3.9946, "step": 3315 }, { "epoch": 2.6213438735177865, "grad_norm": 2.1281697750091553, "learning_rate": 1.192127715343751e-07, "loss": 4.1156, "step": 3316 }, { "epoch": 2.6221343873517786, "grad_norm": 2.478149175643921, "learning_rate": 1.1872322095540078e-07, "loss": 3.6378, "step": 3317 }, { "epoch": 2.6229249011857707, "grad_norm": 2.3075129985809326, "learning_rate": 1.1823463620053249e-07, "loss": 4.167, "step": 3318 }, { "epoch": 2.623715415019763, "grad_norm": 2.458803653717041, "learning_rate": 1.1774701761140144e-07, "loss": 3.9227, "step": 3319 }, { "epoch": 2.624505928853755, "grad_norm": 2.116529941558838, "learning_rate": 1.1726036552896274e-07, "loss": 3.9593, "step": 3320 }, { "epoch": 2.625296442687747, "grad_norm": 2.223292827606201, "learning_rate": 1.1677468029349619e-07, "loss": 3.9242, "step": 3321 }, { "epoch": 2.626086956521739, "grad_norm": 2.3087055683135986, "learning_rate": 1.1628996224460531e-07, "loss": 3.7273, "step": 3322 }, { "epoch": 2.626877470355731, "grad_norm": 2.178239583969116, "learning_rate": 1.1580621172121748e-07, "loss": 3.9531, "step": 3323 }, { "epoch": 2.6276679841897232, "grad_norm": 2.1179003715515137, "learning_rate": 1.1532342906158311e-07, "loss": 4.1839, "step": 3324 }, { "epoch": 2.6284584980237153, "grad_norm": 2.104525089263916, "learning_rate": 1.1484161460327686e-07, "loss": 3.9698, "step": 3325 }, { "epoch": 2.6292490118577074, "grad_norm": 2.099388599395752, "learning_rate": 1.1436076868319489e-07, "loss": 4.0706, "step": 3326 }, { "epoch": 2.6300395256916995, "grad_norm": 2.7553932666778564, "learning_rate": 1.1388089163755744e-07, "loss": 3.7576, "step": 3327 }, { "epoch": 2.6308300395256916, "grad_norm": 2.259288787841797, "learning_rate": 1.1340198380190724e-07, "loss": 3.5042, "step": 3328 }, { "epoch": 2.6316205533596837, "grad_norm": 2.164233446121216, "learning_rate": 1.1292404551110813e-07, "loss": 3.8474, "step": 3329 }, { "epoch": 2.632411067193676, "grad_norm": 1.9776184558868408, "learning_rate": 1.1244707709934727e-07, "loss": 4.3092, "step": 3330 }, { "epoch": 2.633201581027668, "grad_norm": 2.3258533477783203, "learning_rate": 1.1197107890013286e-07, "loss": 3.8818, "step": 3331 }, { "epoch": 2.63399209486166, "grad_norm": 2.1252400875091553, "learning_rate": 1.1149605124629536e-07, "loss": 4.1032, "step": 3332 }, { "epoch": 2.634782608695652, "grad_norm": 2.2036733627319336, "learning_rate": 1.1102199446998657e-07, "loss": 3.9316, "step": 3333 }, { "epoch": 2.635573122529644, "grad_norm": 1.978987216949463, "learning_rate": 1.1054890890267799e-07, "loss": 4.2718, "step": 3334 }, { "epoch": 2.6363636363636362, "grad_norm": 2.123272180557251, "learning_rate": 1.1007679487516436e-07, "loss": 4.0083, "step": 3335 }, { "epoch": 2.6371541501976283, "grad_norm": 2.1444482803344727, "learning_rate": 1.0960565271755962e-07, "loss": 3.9284, "step": 3336 }, { "epoch": 2.6379446640316204, "grad_norm": 2.0230698585510254, "learning_rate": 1.0913548275929792e-07, "loss": 4.0858, "step": 3337 }, { "epoch": 2.6387351778656125, "grad_norm": 2.070610761642456, "learning_rate": 1.0866628532913459e-07, "loss": 3.789, "step": 3338 }, { "epoch": 2.6395256916996046, "grad_norm": 2.055295705795288, "learning_rate": 1.0819806075514438e-07, "loss": 4.0713, "step": 3339 }, { "epoch": 2.6403162055335967, "grad_norm": 2.062252998352051, "learning_rate": 1.0773080936472174e-07, "loss": 4.2094, "step": 3340 }, { "epoch": 2.641106719367589, "grad_norm": 2.2331862449645996, "learning_rate": 1.0726453148458115e-07, "loss": 3.8094, "step": 3341 }, { "epoch": 2.641897233201581, "grad_norm": 2.221052885055542, "learning_rate": 1.0679922744075582e-07, "loss": 3.5504, "step": 3342 }, { "epoch": 2.642687747035573, "grad_norm": 2.218928813934326, "learning_rate": 1.0633489755859833e-07, "loss": 4.0226, "step": 3343 }, { "epoch": 2.643478260869565, "grad_norm": 2.0288712978363037, "learning_rate": 1.0587154216277995e-07, "loss": 4.2064, "step": 3344 }, { "epoch": 2.6442687747035576, "grad_norm": 2.2592074871063232, "learning_rate": 1.0540916157729102e-07, "loss": 4.0956, "step": 3345 }, { "epoch": 2.6450592885375492, "grad_norm": 2.27921986579895, "learning_rate": 1.0494775612543945e-07, "loss": 4.1242, "step": 3346 }, { "epoch": 2.6458498023715418, "grad_norm": 2.307100534439087, "learning_rate": 1.0448732612985145e-07, "loss": 3.9359, "step": 3347 }, { "epoch": 2.6466403162055334, "grad_norm": 2.2456772327423096, "learning_rate": 1.0402787191247238e-07, "loss": 3.7981, "step": 3348 }, { "epoch": 2.647430830039526, "grad_norm": 2.3372066020965576, "learning_rate": 1.035693937945637e-07, "loss": 3.796, "step": 3349 }, { "epoch": 2.6482213438735176, "grad_norm": 2.2038636207580566, "learning_rate": 1.0311189209670518e-07, "loss": 4.1195, "step": 3350 }, { "epoch": 2.64901185770751, "grad_norm": 2.333935499191284, "learning_rate": 1.0265536713879392e-07, "loss": 3.5241, "step": 3351 }, { "epoch": 2.6498023715415018, "grad_norm": 2.0763869285583496, "learning_rate": 1.0219981924004373e-07, "loss": 4.0022, "step": 3352 }, { "epoch": 2.6505928853754943, "grad_norm": 2.3080849647521973, "learning_rate": 1.0174524871898532e-07, "loss": 3.9076, "step": 3353 }, { "epoch": 2.651383399209486, "grad_norm": 2.4141995906829834, "learning_rate": 1.0129165589346646e-07, "loss": 3.8381, "step": 3354 }, { "epoch": 2.6521739130434785, "grad_norm": 2.2510478496551514, "learning_rate": 1.0083904108065007e-07, "loss": 4.0087, "step": 3355 }, { "epoch": 2.65296442687747, "grad_norm": 2.1269381046295166, "learning_rate": 1.0038740459701667e-07, "loss": 4.0893, "step": 3356 }, { "epoch": 2.6537549407114627, "grad_norm": 2.5347068309783936, "learning_rate": 9.993674675836201e-08, "loss": 3.9691, "step": 3357 }, { "epoch": 2.6545454545454543, "grad_norm": 2.186387777328491, "learning_rate": 9.948706787979733e-08, "loss": 4.0314, "step": 3358 }, { "epoch": 2.655335968379447, "grad_norm": 2.2748446464538574, "learning_rate": 9.903836827574947e-08, "loss": 3.627, "step": 3359 }, { "epoch": 2.6561264822134385, "grad_norm": 2.1693313121795654, "learning_rate": 9.859064825996145e-08, "loss": 4.2103, "step": 3360 }, { "epoch": 2.656916996047431, "grad_norm": 2.0217909812927246, "learning_rate": 9.814390814548985e-08, "loss": 3.9686, "step": 3361 }, { "epoch": 2.6577075098814227, "grad_norm": 2.1512279510498047, "learning_rate": 9.76981482447073e-08, "loss": 4.0234, "step": 3362 }, { "epoch": 2.658498023715415, "grad_norm": 2.468512535095215, "learning_rate": 9.725336886929965e-08, "loss": 3.7559, "step": 3363 }, { "epoch": 2.659288537549407, "grad_norm": 2.0593910217285156, "learning_rate": 9.68095703302691e-08, "loss": 3.9913, "step": 3364 }, { "epoch": 2.6600790513833994, "grad_norm": 2.2179484367370605, "learning_rate": 9.636675293793063e-08, "loss": 3.8513, "step": 3365 }, { "epoch": 2.660869565217391, "grad_norm": 2.1155483722686768, "learning_rate": 9.592491700191319e-08, "loss": 4.0075, "step": 3366 }, { "epoch": 2.6616600790513836, "grad_norm": 2.1722323894500732, "learning_rate": 9.548406283115968e-08, "loss": 3.7607, "step": 3367 }, { "epoch": 2.6624505928853752, "grad_norm": 2.1598331928253174, "learning_rate": 9.50441907339275e-08, "loss": 3.7528, "step": 3368 }, { "epoch": 2.6632411067193678, "grad_norm": 2.1846606731414795, "learning_rate": 9.460530101778564e-08, "loss": 3.9022, "step": 3369 }, { "epoch": 2.66403162055336, "grad_norm": 1.9538060426712036, "learning_rate": 9.416739398961727e-08, "loss": 3.9681, "step": 3370 }, { "epoch": 2.664822134387352, "grad_norm": 2.1884379386901855, "learning_rate": 9.37304699556183e-08, "loss": 4.0701, "step": 3371 }, { "epoch": 2.665612648221344, "grad_norm": 2.2601234912872314, "learning_rate": 9.329452922129717e-08, "loss": 3.9886, "step": 3372 }, { "epoch": 2.666403162055336, "grad_norm": 2.1327555179595947, "learning_rate": 9.285957209147472e-08, "loss": 4.1815, "step": 3373 }, { "epoch": 2.667193675889328, "grad_norm": 2.004401683807373, "learning_rate": 9.242559887028451e-08, "loss": 4.1361, "step": 3374 }, { "epoch": 2.6679841897233203, "grad_norm": 2.4439258575439453, "learning_rate": 9.199260986117081e-08, "loss": 4.0795, "step": 3375 }, { "epoch": 2.6687747035573124, "grad_norm": 2.1451117992401123, "learning_rate": 9.15606053668916e-08, "loss": 3.9343, "step": 3376 }, { "epoch": 2.6695652173913045, "grad_norm": 2.3156962394714355, "learning_rate": 9.112958568951523e-08, "loss": 3.7992, "step": 3377 }, { "epoch": 2.6703557312252966, "grad_norm": 2.141719341278076, "learning_rate": 9.069955113042133e-08, "loss": 3.8836, "step": 3378 }, { "epoch": 2.6711462450592887, "grad_norm": 2.286559820175171, "learning_rate": 9.027050199030151e-08, "loss": 3.8146, "step": 3379 }, { "epoch": 2.6719367588932808, "grad_norm": 2.2629919052124023, "learning_rate": 8.984243856915764e-08, "loss": 3.7826, "step": 3380 }, { "epoch": 2.672727272727273, "grad_norm": 2.376903772354126, "learning_rate": 8.941536116630261e-08, "loss": 3.4959, "step": 3381 }, { "epoch": 2.673517786561265, "grad_norm": 2.114168405532837, "learning_rate": 8.898927008036023e-08, "loss": 4.1963, "step": 3382 }, { "epoch": 2.674308300395257, "grad_norm": 2.1581034660339355, "learning_rate": 8.856416560926384e-08, "loss": 4.134, "step": 3383 }, { "epoch": 2.675098814229249, "grad_norm": 2.452641248703003, "learning_rate": 8.814004805025782e-08, "loss": 3.8746, "step": 3384 }, { "epoch": 2.675889328063241, "grad_norm": 2.1481106281280518, "learning_rate": 8.771691769989566e-08, "loss": 4.1834, "step": 3385 }, { "epoch": 2.6766798418972333, "grad_norm": 2.1572425365448, "learning_rate": 8.72947748540413e-08, "loss": 3.9479, "step": 3386 }, { "epoch": 2.6774703557312254, "grad_norm": 2.063657760620117, "learning_rate": 8.687361980786752e-08, "loss": 4.2366, "step": 3387 }, { "epoch": 2.6782608695652175, "grad_norm": 2.126263380050659, "learning_rate": 8.645345285585654e-08, "loss": 4.025, "step": 3388 }, { "epoch": 2.6790513833992096, "grad_norm": 2.262042760848999, "learning_rate": 8.603427429180072e-08, "loss": 3.6677, "step": 3389 }, { "epoch": 2.6798418972332017, "grad_norm": 2.0339736938476562, "learning_rate": 8.561608440879943e-08, "loss": 4.2946, "step": 3390 }, { "epoch": 2.6806324110671937, "grad_norm": 2.1094000339508057, "learning_rate": 8.519888349926241e-08, "loss": 3.7871, "step": 3391 }, { "epoch": 2.681422924901186, "grad_norm": 2.1567299365997314, "learning_rate": 8.478267185490684e-08, "loss": 3.8619, "step": 3392 }, { "epoch": 2.682213438735178, "grad_norm": 2.366164207458496, "learning_rate": 8.436744976675875e-08, "loss": 3.6733, "step": 3393 }, { "epoch": 2.68300395256917, "grad_norm": 2.197983741760254, "learning_rate": 8.395321752515223e-08, "loss": 3.8714, "step": 3394 }, { "epoch": 2.683794466403162, "grad_norm": 2.253312826156616, "learning_rate": 8.353997541972858e-08, "loss": 3.9593, "step": 3395 }, { "epoch": 2.684584980237154, "grad_norm": 2.1254093647003174, "learning_rate": 8.312772373943766e-08, "loss": 3.9167, "step": 3396 }, { "epoch": 2.6853754940711463, "grad_norm": 2.2688353061676025, "learning_rate": 8.271646277253658e-08, "loss": 3.8615, "step": 3397 }, { "epoch": 2.6861660079051384, "grad_norm": 2.2323458194732666, "learning_rate": 8.230619280658897e-08, "loss": 3.8243, "step": 3398 }, { "epoch": 2.6869565217391305, "grad_norm": 2.2587499618530273, "learning_rate": 8.189691412846651e-08, "loss": 3.8049, "step": 3399 }, { "epoch": 2.6877470355731226, "grad_norm": 2.199094772338867, "learning_rate": 8.148862702434712e-08, "loss": 4.1187, "step": 3400 }, { "epoch": 2.6885375494071146, "grad_norm": 2.0432965755462646, "learning_rate": 8.108133177971578e-08, "loss": 4.1125, "step": 3401 }, { "epoch": 2.6893280632411067, "grad_norm": 2.0988006591796875, "learning_rate": 8.067502867936366e-08, "loss": 3.9702, "step": 3402 }, { "epoch": 2.690118577075099, "grad_norm": 2.1995255947113037, "learning_rate": 8.026971800738836e-08, "loss": 3.9233, "step": 3403 }, { "epoch": 2.690909090909091, "grad_norm": 2.3156723976135254, "learning_rate": 7.98654000471935e-08, "loss": 4.2462, "step": 3404 }, { "epoch": 2.691699604743083, "grad_norm": 2.3151638507843018, "learning_rate": 7.946207508148846e-08, "loss": 3.559, "step": 3405 }, { "epoch": 2.692490118577075, "grad_norm": 1.8690992593765259, "learning_rate": 7.905974339228861e-08, "loss": 4.4438, "step": 3406 }, { "epoch": 2.693280632411067, "grad_norm": 2.6611785888671875, "learning_rate": 7.865840526091412e-08, "loss": 4.0352, "step": 3407 }, { "epoch": 2.6940711462450593, "grad_norm": 2.035254955291748, "learning_rate": 7.825806096799099e-08, "loss": 4.0948, "step": 3408 }, { "epoch": 2.6948616600790514, "grad_norm": 3.373739242553711, "learning_rate": 7.785871079345097e-08, "loss": 4.0268, "step": 3409 }, { "epoch": 2.6956521739130435, "grad_norm": 2.0372889041900635, "learning_rate": 7.746035501652887e-08, "loss": 4.1085, "step": 3410 }, { "epoch": 2.6964426877470355, "grad_norm": 2.276451349258423, "learning_rate": 7.706299391576588e-08, "loss": 3.5566, "step": 3411 }, { "epoch": 2.6972332015810276, "grad_norm": 2.0315704345703125, "learning_rate": 7.66666277690069e-08, "loss": 4.2953, "step": 3412 }, { "epoch": 2.6980237154150197, "grad_norm": 2.14501690864563, "learning_rate": 7.627125685340126e-08, "loss": 4.0704, "step": 3413 }, { "epoch": 2.698814229249012, "grad_norm": 2.458604574203491, "learning_rate": 7.587688144540245e-08, "loss": 3.6662, "step": 3414 }, { "epoch": 2.699604743083004, "grad_norm": 2.1529412269592285, "learning_rate": 7.548350182076808e-08, "loss": 4.1646, "step": 3415 }, { "epoch": 2.700395256916996, "grad_norm": 2.885723352432251, "learning_rate": 7.509111825455878e-08, "loss": 3.886, "step": 3416 }, { "epoch": 2.701185770750988, "grad_norm": 2.211162567138672, "learning_rate": 7.469973102113942e-08, "loss": 3.7742, "step": 3417 }, { "epoch": 2.70197628458498, "grad_norm": 2.114293336868286, "learning_rate": 7.430934039417841e-08, "loss": 4.1081, "step": 3418 }, { "epoch": 2.7027667984189723, "grad_norm": 2.0812439918518066, "learning_rate": 7.391994664664625e-08, "loss": 4.0617, "step": 3419 }, { "epoch": 2.7035573122529644, "grad_norm": 2.130643129348755, "learning_rate": 7.353155005081713e-08, "loss": 4.2753, "step": 3420 }, { "epoch": 2.7043478260869565, "grad_norm": 2.228877305984497, "learning_rate": 7.314415087826832e-08, "loss": 3.9075, "step": 3421 }, { "epoch": 2.7051383399209485, "grad_norm": 2.349436044692993, "learning_rate": 7.275774939987879e-08, "loss": 3.5186, "step": 3422 }, { "epoch": 2.7059288537549406, "grad_norm": 2.180612802505493, "learning_rate": 7.237234588583075e-08, "loss": 3.9483, "step": 3423 }, { "epoch": 2.7067193675889327, "grad_norm": 2.0435478687286377, "learning_rate": 7.198794060560765e-08, "loss": 4.0707, "step": 3424 }, { "epoch": 2.707509881422925, "grad_norm": 2.2476489543914795, "learning_rate": 7.160453382799597e-08, "loss": 3.8863, "step": 3425 }, { "epoch": 2.708300395256917, "grad_norm": 2.245502233505249, "learning_rate": 7.122212582108362e-08, "loss": 3.7033, "step": 3426 }, { "epoch": 2.709090909090909, "grad_norm": 2.032092809677124, "learning_rate": 7.084071685225968e-08, "loss": 4.1516, "step": 3427 }, { "epoch": 2.709881422924901, "grad_norm": 2.115539073944092, "learning_rate": 7.0460307188215e-08, "loss": 3.9947, "step": 3428 }, { "epoch": 2.710671936758893, "grad_norm": 2.23571515083313, "learning_rate": 7.008089709494248e-08, "loss": 4.1904, "step": 3429 }, { "epoch": 2.7114624505928853, "grad_norm": 2.36755633354187, "learning_rate": 6.970248683773472e-08, "loss": 3.3964, "step": 3430 }, { "epoch": 2.7122529644268774, "grad_norm": 1.9886525869369507, "learning_rate": 6.93250766811861e-08, "loss": 4.0939, "step": 3431 }, { "epoch": 2.7130434782608694, "grad_norm": 2.3347110748291016, "learning_rate": 6.894866688919132e-08, "loss": 3.6896, "step": 3432 }, { "epoch": 2.7138339920948615, "grad_norm": 2.167959690093994, "learning_rate": 6.857325772494605e-08, "loss": 4.0419, "step": 3433 }, { "epoch": 2.7146245059288536, "grad_norm": 2.261798620223999, "learning_rate": 6.819884945094562e-08, "loss": 3.7578, "step": 3434 }, { "epoch": 2.7154150197628457, "grad_norm": 2.1259453296661377, "learning_rate": 6.78254423289863e-08, "loss": 3.8332, "step": 3435 }, { "epoch": 2.716205533596838, "grad_norm": 2.191232204437256, "learning_rate": 6.745303662016339e-08, "loss": 4.3232, "step": 3436 }, { "epoch": 2.71699604743083, "grad_norm": 2.174882411956787, "learning_rate": 6.708163258487293e-08, "loss": 3.858, "step": 3437 }, { "epoch": 2.717786561264822, "grad_norm": 2.1966848373413086, "learning_rate": 6.671123048281019e-08, "loss": 4.213, "step": 3438 }, { "epoch": 2.718577075098814, "grad_norm": 2.2099432945251465, "learning_rate": 6.63418305729695e-08, "loss": 3.8755, "step": 3439 }, { "epoch": 2.719367588932806, "grad_norm": 2.1087942123413086, "learning_rate": 6.59734331136449e-08, "loss": 4.1375, "step": 3440 }, { "epoch": 2.7201581027667983, "grad_norm": 2.1317977905273438, "learning_rate": 6.560603836242918e-08, "loss": 4.3473, "step": 3441 }, { "epoch": 2.7209486166007903, "grad_norm": 2.084190845489502, "learning_rate": 6.523964657621434e-08, "loss": 4.0866, "step": 3442 }, { "epoch": 2.7217391304347824, "grad_norm": 2.322957992553711, "learning_rate": 6.487425801119079e-08, "loss": 3.8502, "step": 3443 }, { "epoch": 2.722529644268775, "grad_norm": 2.326212167739868, "learning_rate": 6.450987292284766e-08, "loss": 3.5381, "step": 3444 }, { "epoch": 2.7233201581027666, "grad_norm": 2.023808240890503, "learning_rate": 6.414649156597229e-08, "loss": 4.1752, "step": 3445 }, { "epoch": 2.724110671936759, "grad_norm": 2.1393120288848877, "learning_rate": 6.378411419465013e-08, "loss": 4.2356, "step": 3446 }, { "epoch": 2.724901185770751, "grad_norm": 2.306692361831665, "learning_rate": 6.342274106226514e-08, "loss": 3.9336, "step": 3447 }, { "epoch": 2.7256916996047433, "grad_norm": 2.0304148197174072, "learning_rate": 6.30623724214982e-08, "loss": 4.0678, "step": 3448 }, { "epoch": 2.726482213438735, "grad_norm": 2.264172077178955, "learning_rate": 6.270300852432825e-08, "loss": 4.0305, "step": 3449 }, { "epoch": 2.7272727272727275, "grad_norm": 2.1173465251922607, "learning_rate": 6.23446496220323e-08, "loss": 4.2216, "step": 3450 }, { "epoch": 2.728063241106719, "grad_norm": 1.9048599004745483, "learning_rate": 6.198729596518337e-08, "loss": 4.3804, "step": 3451 }, { "epoch": 2.7288537549407117, "grad_norm": 2.8793067932128906, "learning_rate": 6.16309478036528e-08, "loss": 3.9843, "step": 3452 }, { "epoch": 2.7296442687747033, "grad_norm": 1.9125823974609375, "learning_rate": 6.127560538660825e-08, "loss": 4.2872, "step": 3453 }, { "epoch": 2.730434782608696, "grad_norm": 2.265953779220581, "learning_rate": 6.092126896251399e-08, "loss": 3.8712, "step": 3454 }, { "epoch": 2.7312252964426875, "grad_norm": 2.192941904067993, "learning_rate": 6.056793877913164e-08, "loss": 3.8183, "step": 3455 }, { "epoch": 2.73201581027668, "grad_norm": 2.137932300567627, "learning_rate": 6.021561508351808e-08, "loss": 4.0533, "step": 3456 }, { "epoch": 2.7328063241106717, "grad_norm": 2.4002482891082764, "learning_rate": 5.986429812202727e-08, "loss": 3.5167, "step": 3457 }, { "epoch": 2.7335968379446642, "grad_norm": 2.1976253986358643, "learning_rate": 5.951398814030951e-08, "loss": 3.9176, "step": 3458 }, { "epoch": 2.734387351778656, "grad_norm": 2.2591850757598877, "learning_rate": 5.916468538330988e-08, "loss": 3.902, "step": 3459 }, { "epoch": 2.7351778656126484, "grad_norm": 2.074376106262207, "learning_rate": 5.881639009527018e-08, "loss": 4.2861, "step": 3460 }, { "epoch": 2.73596837944664, "grad_norm": 2.0515029430389404, "learning_rate": 5.846910251972726e-08, "loss": 4.3288, "step": 3461 }, { "epoch": 2.7367588932806326, "grad_norm": 2.019735097885132, "learning_rate": 5.8122822899513375e-08, "loss": 4.3106, "step": 3462 }, { "epoch": 2.7375494071146242, "grad_norm": 2.0375239849090576, "learning_rate": 5.777755147675634e-08, "loss": 4.2232, "step": 3463 }, { "epoch": 2.738339920948617, "grad_norm": 1.9884589910507202, "learning_rate": 5.743328849287871e-08, "loss": 4.4592, "step": 3464 }, { "epoch": 2.7391304347826084, "grad_norm": 2.270547389984131, "learning_rate": 5.709003418859793e-08, "loss": 4.1255, "step": 3465 }, { "epoch": 2.739920948616601, "grad_norm": 2.336083173751831, "learning_rate": 5.674778880392617e-08, "loss": 3.8363, "step": 3466 }, { "epoch": 2.740711462450593, "grad_norm": 1.966355562210083, "learning_rate": 5.6406552578170515e-08, "loss": 4.3808, "step": 3467 }, { "epoch": 2.741501976284585, "grad_norm": 2.252253770828247, "learning_rate": 5.606632574993142e-08, "loss": 4.1641, "step": 3468 }, { "epoch": 2.7422924901185772, "grad_norm": 2.1741480827331543, "learning_rate": 5.572710855710444e-08, "loss": 3.9198, "step": 3469 }, { "epoch": 2.7430830039525693, "grad_norm": 2.0798025131225586, "learning_rate": 5.5388901236879506e-08, "loss": 3.9892, "step": 3470 }, { "epoch": 2.7438735177865614, "grad_norm": 1.944604516029358, "learning_rate": 5.5051704025739115e-08, "loss": 4.1755, "step": 3471 }, { "epoch": 2.7446640316205535, "grad_norm": 2.118901014328003, "learning_rate": 5.471551715946066e-08, "loss": 4.3497, "step": 3472 }, { "epoch": 2.7454545454545456, "grad_norm": 1.9873863458633423, "learning_rate": 5.438034087311411e-08, "loss": 4.2248, "step": 3473 }, { "epoch": 2.7462450592885377, "grad_norm": 2.0793709754943848, "learning_rate": 5.4046175401063826e-08, "loss": 4.2292, "step": 3474 }, { "epoch": 2.7470355731225298, "grad_norm": 1.956039547920227, "learning_rate": 5.371302097696656e-08, "loss": 4.1786, "step": 3475 }, { "epoch": 2.747826086956522, "grad_norm": 2.3298521041870117, "learning_rate": 5.338087783377266e-08, "loss": 3.6873, "step": 3476 }, { "epoch": 2.748616600790514, "grad_norm": 2.4346566200256348, "learning_rate": 5.304974620372466e-08, "loss": 3.7686, "step": 3477 }, { "epoch": 2.749407114624506, "grad_norm": 2.2589175701141357, "learning_rate": 5.271962631835886e-08, "loss": 3.9901, "step": 3478 }, { "epoch": 2.750197628458498, "grad_norm": 2.182697296142578, "learning_rate": 5.2390518408503285e-08, "loss": 4.1563, "step": 3479 }, { "epoch": 2.7509881422924902, "grad_norm": 2.286679744720459, "learning_rate": 5.206242270427852e-08, "loss": 3.6209, "step": 3480 }, { "epoch": 2.7517786561264823, "grad_norm": 2.139998435974121, "learning_rate": 5.173533943509756e-08, "loss": 3.8138, "step": 3481 }, { "epoch": 2.7525691699604744, "grad_norm": 2.1652188301086426, "learning_rate": 5.140926882966529e-08, "loss": 4.0418, "step": 3482 }, { "epoch": 2.7533596837944665, "grad_norm": 2.106252670288086, "learning_rate": 5.108421111597883e-08, "loss": 3.9764, "step": 3483 }, { "epoch": 2.7541501976284586, "grad_norm": 2.5008890628814697, "learning_rate": 5.0760166521327033e-08, "loss": 3.7003, "step": 3484 }, { "epoch": 2.7549407114624507, "grad_norm": 2.1640071868896484, "learning_rate": 5.043713527228949e-08, "loss": 3.8477, "step": 3485 }, { "epoch": 2.7557312252964428, "grad_norm": 2.374685287475586, "learning_rate": 5.011511759473852e-08, "loss": 3.9159, "step": 3486 }, { "epoch": 2.756521739130435, "grad_norm": 2.200883626937866, "learning_rate": 4.9794113713837184e-08, "loss": 3.9339, "step": 3487 }, { "epoch": 2.757312252964427, "grad_norm": 2.219926118850708, "learning_rate": 4.9474123854039266e-08, "loss": 3.6094, "step": 3488 }, { "epoch": 2.758102766798419, "grad_norm": 2.5233023166656494, "learning_rate": 4.9155148239089786e-08, "loss": 4.0438, "step": 3489 }, { "epoch": 2.758893280632411, "grad_norm": 2.1447174549102783, "learning_rate": 4.883718709202534e-08, "loss": 3.7846, "step": 3490 }, { "epoch": 2.759683794466403, "grad_norm": 1.926173210144043, "learning_rate": 4.852024063517174e-08, "loss": 4.2022, "step": 3491 }, { "epoch": 2.7604743083003953, "grad_norm": 1.9995479583740234, "learning_rate": 4.820430909014639e-08, "loss": 4.3115, "step": 3492 }, { "epoch": 2.7612648221343874, "grad_norm": 2.2520549297332764, "learning_rate": 4.7889392677856915e-08, "loss": 3.986, "step": 3493 }, { "epoch": 2.7620553359683795, "grad_norm": 2.268608570098877, "learning_rate": 4.75754916185005e-08, "loss": 3.9053, "step": 3494 }, { "epoch": 2.7628458498023716, "grad_norm": 2.088453531265259, "learning_rate": 4.7262606131565245e-08, "loss": 3.9874, "step": 3495 }, { "epoch": 2.7636363636363637, "grad_norm": 3.098453998565674, "learning_rate": 4.695073643582848e-08, "loss": 3.6897, "step": 3496 }, { "epoch": 2.7644268774703558, "grad_norm": 2.1479506492614746, "learning_rate": 4.6639882749357265e-08, "loss": 3.9439, "step": 3497 }, { "epoch": 2.765217391304348, "grad_norm": 2.1677448749542236, "learning_rate": 4.63300452895089e-08, "loss": 4.0835, "step": 3498 }, { "epoch": 2.76600790513834, "grad_norm": 2.4556970596313477, "learning_rate": 4.60212242729296e-08, "loss": 3.4252, "step": 3499 }, { "epoch": 2.766798418972332, "grad_norm": 2.082259178161621, "learning_rate": 4.571341991555461e-08, "loss": 3.8237, "step": 3500 }, { "epoch": 2.767588932806324, "grad_norm": 1.949383020401001, "learning_rate": 4.540663243260878e-08, "loss": 4.3077, "step": 3501 }, { "epoch": 2.768379446640316, "grad_norm": 2.166334629058838, "learning_rate": 4.510086203860586e-08, "loss": 3.8848, "step": 3502 }, { "epoch": 2.7691699604743083, "grad_norm": 2.2718710899353027, "learning_rate": 4.479610894734831e-08, "loss": 3.7969, "step": 3503 }, { "epoch": 2.7699604743083004, "grad_norm": 2.3814938068389893, "learning_rate": 4.449237337192735e-08, "loss": 4.1407, "step": 3504 }, { "epoch": 2.7707509881422925, "grad_norm": 2.198599338531494, "learning_rate": 4.4189655524722425e-08, "loss": 4.0461, "step": 3505 }, { "epoch": 2.7715415019762846, "grad_norm": 2.0810272693634033, "learning_rate": 4.388795561740189e-08, "loss": 4.2669, "step": 3506 }, { "epoch": 2.7723320158102767, "grad_norm": 2.219601631164551, "learning_rate": 4.358727386092198e-08, "loss": 3.793, "step": 3507 }, { "epoch": 2.7731225296442688, "grad_norm": 1.9860135316848755, "learning_rate": 4.3287610465527036e-08, "loss": 4.3927, "step": 3508 }, { "epoch": 2.773913043478261, "grad_norm": 2.0856997966766357, "learning_rate": 4.298896564074944e-08, "loss": 4.064, "step": 3509 }, { "epoch": 2.774703557312253, "grad_norm": 2.4872219562530518, "learning_rate": 4.269133959540899e-08, "loss": 3.8458, "step": 3510 }, { "epoch": 2.775494071146245, "grad_norm": 2.1537063121795654, "learning_rate": 4.239473253761406e-08, "loss": 3.8629, "step": 3511 }, { "epoch": 2.776284584980237, "grad_norm": 2.1783907413482666, "learning_rate": 4.2099144674759394e-08, "loss": 4.2754, "step": 3512 }, { "epoch": 2.777075098814229, "grad_norm": 2.0274770259857178, "learning_rate": 4.180457621352768e-08, "loss": 4.1001, "step": 3513 }, { "epoch": 2.7778656126482213, "grad_norm": 2.2340705394744873, "learning_rate": 4.151102735988882e-08, "loss": 4.0849, "step": 3514 }, { "epoch": 2.7786561264822134, "grad_norm": 2.110105037689209, "learning_rate": 4.1218498319099796e-08, "loss": 4.1855, "step": 3515 }, { "epoch": 2.7794466403162055, "grad_norm": 2.1685731410980225, "learning_rate": 4.0926989295704323e-08, "loss": 4.215, "step": 3516 }, { "epoch": 2.7802371541501976, "grad_norm": 2.2036097049713135, "learning_rate": 4.063650049353268e-08, "loss": 4.2413, "step": 3517 }, { "epoch": 2.7810276679841897, "grad_norm": 2.235219717025757, "learning_rate": 4.03470321157024e-08, "loss": 3.8716, "step": 3518 }, { "epoch": 2.7818181818181817, "grad_norm": 2.1926565170288086, "learning_rate": 4.0058584364617404e-08, "loss": 4.0026, "step": 3519 }, { "epoch": 2.782608695652174, "grad_norm": 2.0635011196136475, "learning_rate": 3.977115744196719e-08, "loss": 3.9631, "step": 3520 }, { "epoch": 2.783399209486166, "grad_norm": 2.186377763748169, "learning_rate": 3.948475154872833e-08, "loss": 3.8918, "step": 3521 }, { "epoch": 2.784189723320158, "grad_norm": 2.224841833114624, "learning_rate": 3.919936688516329e-08, "loss": 4.1938, "step": 3522 }, { "epoch": 2.78498023715415, "grad_norm": 2.5754077434539795, "learning_rate": 3.891500365081996e-08, "loss": 3.9728, "step": 3523 }, { "epoch": 2.785770750988142, "grad_norm": 2.105949878692627, "learning_rate": 3.863166204453278e-08, "loss": 4.2003, "step": 3524 }, { "epoch": 2.7865612648221343, "grad_norm": 2.1585500240325928, "learning_rate": 3.8349342264421436e-08, "loss": 4.0119, "step": 3525 }, { "epoch": 2.7873517786561264, "grad_norm": 2.111158609390259, "learning_rate": 3.80680445078907e-08, "loss": 3.9829, "step": 3526 }, { "epoch": 2.7881422924901185, "grad_norm": 2.1195547580718994, "learning_rate": 3.778776897163155e-08, "loss": 4.0717, "step": 3527 }, { "epoch": 2.7889328063241106, "grad_norm": 1.999621868133545, "learning_rate": 3.7508515851619905e-08, "loss": 4.1508, "step": 3528 }, { "epoch": 2.7897233201581026, "grad_norm": 2.2307193279266357, "learning_rate": 3.723028534311657e-08, "loss": 4.105, "step": 3529 }, { "epoch": 2.7905138339920947, "grad_norm": 2.0347084999084473, "learning_rate": 3.6953077640667076e-08, "loss": 4.255, "step": 3530 }, { "epoch": 2.791304347826087, "grad_norm": 2.1582701206207275, "learning_rate": 3.667689293810289e-08, "loss": 4.217, "step": 3531 }, { "epoch": 2.792094861660079, "grad_norm": 2.045454263687134, "learning_rate": 3.6401731428538864e-08, "loss": 4.2449, "step": 3532 }, { "epoch": 2.792885375494071, "grad_norm": 1.9791374206542969, "learning_rate": 3.612759330437509e-08, "loss": 4.1548, "step": 3533 }, { "epoch": 2.793675889328063, "grad_norm": 2.111781358718872, "learning_rate": 3.5854478757296085e-08, "loss": 4.0042, "step": 3534 }, { "epoch": 2.794466403162055, "grad_norm": 2.202965021133423, "learning_rate": 3.558238797827057e-08, "loss": 3.7975, "step": 3535 }, { "epoch": 2.7952569169960473, "grad_norm": 2.2946858406066895, "learning_rate": 3.53113211575512e-08, "loss": 3.8771, "step": 3536 }, { "epoch": 2.7960474308300394, "grad_norm": 2.0812923908233643, "learning_rate": 3.504127848467503e-08, "loss": 4.0917, "step": 3537 }, { "epoch": 2.7968379446640315, "grad_norm": 2.0617213249206543, "learning_rate": 3.4772260148462355e-08, "loss": 4.1545, "step": 3538 }, { "epoch": 2.7976284584980236, "grad_norm": 2.1657118797302246, "learning_rate": 3.4504266337018364e-08, "loss": 4.143, "step": 3539 }, { "epoch": 2.7984189723320156, "grad_norm": 2.123257637023926, "learning_rate": 3.4237297237731e-08, "loss": 4.0178, "step": 3540 }, { "epoch": 2.799209486166008, "grad_norm": 2.114448308944702, "learning_rate": 3.397135303727161e-08, "loss": 4.1836, "step": 3541 }, { "epoch": 2.8, "grad_norm": 2.0387725830078125, "learning_rate": 3.370643392159528e-08, "loss": 4.1131, "step": 3542 }, { "epoch": 2.8007905138339924, "grad_norm": 2.3498897552490234, "learning_rate": 3.344254007594033e-08, "loss": 3.9057, "step": 3543 }, { "epoch": 2.801581027667984, "grad_norm": 2.1483449935913086, "learning_rate": 3.3179671684828164e-08, "loss": 4.0753, "step": 3544 }, { "epoch": 2.8023715415019765, "grad_norm": 2.5285580158233643, "learning_rate": 3.291782893206308e-08, "loss": 3.7841, "step": 3545 }, { "epoch": 2.803162055335968, "grad_norm": 2.2497544288635254, "learning_rate": 3.265701200073196e-08, "loss": 3.5327, "step": 3546 }, { "epoch": 2.8039525691699607, "grad_norm": 2.173307418823242, "learning_rate": 3.239722107320492e-08, "loss": 4.1107, "step": 3547 }, { "epoch": 2.8047430830039524, "grad_norm": 1.8760836124420166, "learning_rate": 3.213845633113466e-08, "loss": 4.2347, "step": 3548 }, { "epoch": 2.805533596837945, "grad_norm": 2.1600847244262695, "learning_rate": 3.1880717955455605e-08, "loss": 3.9637, "step": 3549 }, { "epoch": 2.8063241106719365, "grad_norm": 2.1761181354522705, "learning_rate": 3.162400612638527e-08, "loss": 3.8855, "step": 3550 }, { "epoch": 2.807114624505929, "grad_norm": 2.290501832962036, "learning_rate": 3.1368321023423406e-08, "loss": 3.9266, "step": 3551 }, { "epoch": 2.8079051383399207, "grad_norm": 2.299513578414917, "learning_rate": 3.111366282535116e-08, "loss": 3.6474, "step": 3552 }, { "epoch": 2.8086956521739133, "grad_norm": 2.2224957942962646, "learning_rate": 3.0860031710232275e-08, "loss": 3.809, "step": 3553 }, { "epoch": 2.809486166007905, "grad_norm": 2.036870002746582, "learning_rate": 3.060742785541204e-08, "loss": 4.3139, "step": 3554 }, { "epoch": 2.8102766798418974, "grad_norm": 2.258730173110962, "learning_rate": 3.0355851437517676e-08, "loss": 4.0447, "step": 3555 }, { "epoch": 2.811067193675889, "grad_norm": 2.2809770107269287, "learning_rate": 3.0105302632457645e-08, "loss": 3.9921, "step": 3556 }, { "epoch": 2.8118577075098816, "grad_norm": 2.0963711738586426, "learning_rate": 2.985578161542246e-08, "loss": 4.1904, "step": 3557 }, { "epoch": 2.8126482213438733, "grad_norm": 2.1753132343292236, "learning_rate": 2.9607288560883073e-08, "loss": 3.5492, "step": 3558 }, { "epoch": 2.813438735177866, "grad_norm": 2.0585570335388184, "learning_rate": 2.9359823642592654e-08, "loss": 4.3104, "step": 3559 }, { "epoch": 2.8142292490118574, "grad_norm": 1.9283596277236938, "learning_rate": 2.911338703358496e-08, "loss": 4.3409, "step": 3560 }, { "epoch": 2.81501976284585, "grad_norm": 2.2064156532287598, "learning_rate": 2.8867978906174485e-08, "loss": 3.8494, "step": 3561 }, { "epoch": 2.8158102766798416, "grad_norm": 2.159242630004883, "learning_rate": 2.8623599431957137e-08, "loss": 4.21, "step": 3562 }, { "epoch": 2.816600790513834, "grad_norm": 2.009911060333252, "learning_rate": 2.8380248781809226e-08, "loss": 4.2203, "step": 3563 }, { "epoch": 2.8173913043478263, "grad_norm": 2.2611515522003174, "learning_rate": 2.8137927125887807e-08, "loss": 3.5179, "step": 3564 }, { "epoch": 2.8181818181818183, "grad_norm": 2.1946117877960205, "learning_rate": 2.7896634633630346e-08, "loss": 4.038, "step": 3565 }, { "epoch": 2.8189723320158104, "grad_norm": 2.1403415203094482, "learning_rate": 2.765637147375455e-08, "loss": 3.7169, "step": 3566 }, { "epoch": 2.8197628458498025, "grad_norm": 2.2514517307281494, "learning_rate": 2.741713781425903e-08, "loss": 4.1063, "step": 3567 }, { "epoch": 2.8205533596837946, "grad_norm": 2.214693307876587, "learning_rate": 2.7178933822421647e-08, "loss": 3.8428, "step": 3568 }, { "epoch": 2.8213438735177867, "grad_norm": 2.3222057819366455, "learning_rate": 2.6941759664801002e-08, "loss": 3.9831, "step": 3569 }, { "epoch": 2.822134387351779, "grad_norm": 2.073532819747925, "learning_rate": 2.6705615507235103e-08, "loss": 4.1837, "step": 3570 }, { "epoch": 2.822924901185771, "grad_norm": 2.2038674354553223, "learning_rate": 2.6470501514842205e-08, "loss": 4.2761, "step": 3571 }, { "epoch": 2.823715415019763, "grad_norm": 2.441739797592163, "learning_rate": 2.6236417852019968e-08, "loss": 3.7992, "step": 3572 }, { "epoch": 2.824505928853755, "grad_norm": 2.387259006500244, "learning_rate": 2.6003364682445795e-08, "loss": 3.812, "step": 3573 }, { "epoch": 2.825296442687747, "grad_norm": 2.3097469806671143, "learning_rate": 2.5771342169076173e-08, "loss": 3.9734, "step": 3574 }, { "epoch": 2.8260869565217392, "grad_norm": 2.0317630767822266, "learning_rate": 2.5540350474147324e-08, "loss": 4.3934, "step": 3575 }, { "epoch": 2.8268774703557313, "grad_norm": 2.2182729244232178, "learning_rate": 2.5310389759174545e-08, "loss": 3.9661, "step": 3576 }, { "epoch": 2.8276679841897234, "grad_norm": 2.119412899017334, "learning_rate": 2.5081460184952555e-08, "loss": 4.0018, "step": 3577 }, { "epoch": 2.8284584980237155, "grad_norm": 2.477691411972046, "learning_rate": 2.48535619115543e-08, "loss": 3.8542, "step": 3578 }, { "epoch": 2.8292490118577076, "grad_norm": 2.0318329334259033, "learning_rate": 2.4626695098332152e-08, "loss": 4.4118, "step": 3579 }, { "epoch": 2.8300395256916997, "grad_norm": 1.996160626411438, "learning_rate": 2.440085990391755e-08, "loss": 4.0053, "step": 3580 }, { "epoch": 2.830830039525692, "grad_norm": 2.2162420749664307, "learning_rate": 2.4176056486220012e-08, "loss": 3.7607, "step": 3581 }, { "epoch": 2.831620553359684, "grad_norm": 2.126607894897461, "learning_rate": 2.3952285002427798e-08, "loss": 4.076, "step": 3582 }, { "epoch": 2.832411067193676, "grad_norm": 2.151864767074585, "learning_rate": 2.3729545609007585e-08, "loss": 3.7882, "step": 3583 }, { "epoch": 2.833201581027668, "grad_norm": 2.0553152561187744, "learning_rate": 2.3507838461704788e-08, "loss": 4.0641, "step": 3584 }, { "epoch": 2.83399209486166, "grad_norm": 2.222590684890747, "learning_rate": 2.328716371554257e-08, "loss": 3.864, "step": 3585 }, { "epoch": 2.8347826086956522, "grad_norm": 1.9622046947479248, "learning_rate": 2.3067521524822333e-08, "loss": 4.4495, "step": 3586 }, { "epoch": 2.8355731225296443, "grad_norm": 2.190237522125244, "learning_rate": 2.2848912043123394e-08, "loss": 4.0656, "step": 3587 }, { "epoch": 2.8363636363636364, "grad_norm": 2.184952974319458, "learning_rate": 2.2631335423303313e-08, "loss": 4.1509, "step": 3588 }, { "epoch": 2.8371541501976285, "grad_norm": 2.1085264682769775, "learning_rate": 2.2414791817497394e-08, "loss": 3.9447, "step": 3589 }, { "epoch": 2.8379446640316206, "grad_norm": 2.072410821914673, "learning_rate": 2.219928137711802e-08, "loss": 4.1543, "step": 3590 }, { "epoch": 2.8387351778656127, "grad_norm": 2.2808103561401367, "learning_rate": 2.1984804252855652e-08, "loss": 4.0879, "step": 3591 }, { "epoch": 2.839525691699605, "grad_norm": 2.163963556289673, "learning_rate": 2.1771360594678657e-08, "loss": 3.7859, "step": 3592 }, { "epoch": 2.840316205533597, "grad_norm": 2.079807996749878, "learning_rate": 2.155895055183199e-08, "loss": 4.0057, "step": 3593 }, { "epoch": 2.841106719367589, "grad_norm": 2.2001893520355225, "learning_rate": 2.1347574272838177e-08, "loss": 3.99, "step": 3594 }, { "epoch": 2.841897233201581, "grad_norm": 2.334801435470581, "learning_rate": 2.113723190549699e-08, "loss": 3.8266, "step": 3595 }, { "epoch": 2.842687747035573, "grad_norm": 2.505524158477783, "learning_rate": 2.0927923596885112e-08, "loss": 4.1125, "step": 3596 }, { "epoch": 2.8434782608695652, "grad_norm": 2.164391279220581, "learning_rate": 2.0719649493356474e-08, "loss": 4.2253, "step": 3597 }, { "epoch": 2.8442687747035573, "grad_norm": 2.055142879486084, "learning_rate": 2.0512409740541414e-08, "loss": 4.0637, "step": 3598 }, { "epoch": 2.8450592885375494, "grad_norm": 2.3992207050323486, "learning_rate": 2.0306204483347356e-08, "loss": 3.835, "step": 3599 }, { "epoch": 2.8458498023715415, "grad_norm": 2.2682101726531982, "learning_rate": 2.0101033865958462e-08, "loss": 3.9702, "step": 3600 }, { "epoch": 2.8466403162055336, "grad_norm": 2.2102932929992676, "learning_rate": 1.989689803183531e-08, "loss": 3.9422, "step": 3601 }, { "epoch": 2.8474308300395257, "grad_norm": 2.5119943618774414, "learning_rate": 1.969379712371455e-08, "loss": 3.8121, "step": 3602 }, { "epoch": 2.8482213438735178, "grad_norm": 2.122117519378662, "learning_rate": 1.949173128360976e-08, "loss": 4.0942, "step": 3603 }, { "epoch": 2.84901185770751, "grad_norm": 2.480971336364746, "learning_rate": 1.9290700652810576e-08, "loss": 3.8307, "step": 3604 }, { "epoch": 2.849802371541502, "grad_norm": 2.420750617980957, "learning_rate": 1.9090705371882567e-08, "loss": 4.0864, "step": 3605 }, { "epoch": 2.850592885375494, "grad_norm": 2.0183682441711426, "learning_rate": 1.889174558066803e-08, "loss": 4.1667, "step": 3606 }, { "epoch": 2.851383399209486, "grad_norm": 2.273634433746338, "learning_rate": 1.8693821418284183e-08, "loss": 4.0988, "step": 3607 }, { "epoch": 2.8521739130434782, "grad_norm": 2.1035311222076416, "learning_rate": 1.8496933023124994e-08, "loss": 4.1331, "step": 3608 }, { "epoch": 2.8529644268774703, "grad_norm": 2.170426368713379, "learning_rate": 1.8301080532859836e-08, "loss": 3.9563, "step": 3609 }, { "epoch": 2.8537549407114624, "grad_norm": 2.154207229614258, "learning_rate": 1.8106264084433665e-08, "loss": 3.8232, "step": 3610 }, { "epoch": 2.8545454545454545, "grad_norm": 2.19111967086792, "learning_rate": 1.791248381406735e-08, "loss": 3.9461, "step": 3611 }, { "epoch": 2.8553359683794466, "grad_norm": 2.212050676345825, "learning_rate": 1.7719739857256832e-08, "loss": 3.86, "step": 3612 }, { "epoch": 2.8561264822134387, "grad_norm": 2.049811840057373, "learning_rate": 1.7528032348773648e-08, "loss": 4.0365, "step": 3613 }, { "epoch": 2.8569169960474308, "grad_norm": 2.3694608211517334, "learning_rate": 1.7337361422664565e-08, "loss": 3.7295, "step": 3614 }, { "epoch": 2.857707509881423, "grad_norm": 2.118088960647583, "learning_rate": 1.714772721225161e-08, "loss": 4.1643, "step": 3615 }, { "epoch": 2.858498023715415, "grad_norm": 2.1476428508758545, "learning_rate": 1.6959129850132037e-08, "loss": 4.1014, "step": 3616 }, { "epoch": 2.859288537549407, "grad_norm": 2.522303581237793, "learning_rate": 1.6771569468177872e-08, "loss": 3.7529, "step": 3617 }, { "epoch": 2.860079051383399, "grad_norm": 2.43945050239563, "learning_rate": 1.6585046197536035e-08, "loss": 3.8496, "step": 3618 }, { "epoch": 2.860869565217391, "grad_norm": 2.1111273765563965, "learning_rate": 1.6399560168628368e-08, "loss": 4.3838, "step": 3619 }, { "epoch": 2.8616600790513833, "grad_norm": 2.3156611919403076, "learning_rate": 1.6215111511151626e-08, "loss": 4.1267, "step": 3620 }, { "epoch": 2.8624505928853754, "grad_norm": 2.1334891319274902, "learning_rate": 1.6031700354076972e-08, "loss": 3.7213, "step": 3621 }, { "epoch": 2.8632411067193675, "grad_norm": 2.099215269088745, "learning_rate": 1.584932682564999e-08, "loss": 4.0888, "step": 3622 }, { "epoch": 2.8640316205533596, "grad_norm": 2.22739315032959, "learning_rate": 1.5667991053391172e-08, "loss": 3.9305, "step": 3623 }, { "epoch": 2.8648221343873517, "grad_norm": 2.289052963256836, "learning_rate": 1.548769316409493e-08, "loss": 3.9177, "step": 3624 }, { "epoch": 2.8656126482213438, "grad_norm": 2.056722640991211, "learning_rate": 1.5308433283830414e-08, "loss": 4.078, "step": 3625 }, { "epoch": 2.866403162055336, "grad_norm": 2.144869804382324, "learning_rate": 1.5130211537940363e-08, "loss": 3.7467, "step": 3626 }, { "epoch": 2.867193675889328, "grad_norm": 2.545809268951416, "learning_rate": 1.4953028051042428e-08, "loss": 3.6008, "step": 3627 }, { "epoch": 2.86798418972332, "grad_norm": 2.201687812805176, "learning_rate": 1.4776882947027504e-08, "loss": 3.9815, "step": 3628 }, { "epoch": 2.868774703557312, "grad_norm": 2.2563984394073486, "learning_rate": 1.4601776349061069e-08, "loss": 3.785, "step": 3629 }, { "epoch": 2.869565217391304, "grad_norm": 1.9048781394958496, "learning_rate": 1.442770837958185e-08, "loss": 4.3818, "step": 3630 }, { "epoch": 2.8703557312252963, "grad_norm": 1.9631173610687256, "learning_rate": 1.4254679160302652e-08, "loss": 4.1217, "step": 3631 }, { "epoch": 2.8711462450592884, "grad_norm": 2.2945590019226074, "learning_rate": 1.4082688812210032e-08, "loss": 3.8088, "step": 3632 }, { "epoch": 2.8719367588932805, "grad_norm": 2.533334970474243, "learning_rate": 1.3911737455564122e-08, "loss": 3.5845, "step": 3633 }, { "epoch": 2.8727272727272726, "grad_norm": 2.410271406173706, "learning_rate": 1.3741825209898307e-08, "loss": 3.6805, "step": 3634 }, { "epoch": 2.8735177865612647, "grad_norm": 3.050248384475708, "learning_rate": 1.3572952194019717e-08, "loss": 3.9197, "step": 3635 }, { "epoch": 2.8743083003952568, "grad_norm": 2.135913610458374, "learning_rate": 1.3405118526008563e-08, "loss": 3.7872, "step": 3636 }, { "epoch": 2.875098814229249, "grad_norm": 2.1321380138397217, "learning_rate": 1.3238324323218642e-08, "loss": 4.0531, "step": 3637 }, { "epoch": 2.8758893280632414, "grad_norm": 2.3581035137176514, "learning_rate": 1.307256970227666e-08, "loss": 4.034, "step": 3638 }, { "epoch": 2.876679841897233, "grad_norm": 2.088820219039917, "learning_rate": 1.2907854779082406e-08, "loss": 4.0655, "step": 3639 }, { "epoch": 2.8774703557312256, "grad_norm": 2.1774399280548096, "learning_rate": 1.2744179668808587e-08, "loss": 3.9253, "step": 3640 }, { "epoch": 2.878260869565217, "grad_norm": 2.1512033939361572, "learning_rate": 1.2581544485901653e-08, "loss": 3.9876, "step": 3641 }, { "epoch": 2.8790513833992097, "grad_norm": 2.3014330863952637, "learning_rate": 1.241994934407964e-08, "loss": 4.1412, "step": 3642 }, { "epoch": 2.8798418972332014, "grad_norm": 2.2419559955596924, "learning_rate": 1.225939435633433e-08, "loss": 4.193, "step": 3643 }, { "epoch": 2.880632411067194, "grad_norm": 2.0805344581604004, "learning_rate": 1.2099879634929922e-08, "loss": 4.3803, "step": 3644 }, { "epoch": 2.8814229249011856, "grad_norm": 2.4223079681396484, "learning_rate": 1.1941405291403029e-08, "loss": 3.7466, "step": 3645 }, { "epoch": 2.882213438735178, "grad_norm": 2.1694014072418213, "learning_rate": 1.1783971436563013e-08, "loss": 4.0416, "step": 3646 }, { "epoch": 2.8830039525691697, "grad_norm": 2.164515256881714, "learning_rate": 1.1627578180491815e-08, "loss": 3.8373, "step": 3647 }, { "epoch": 2.8837944664031623, "grad_norm": 2.143866539001465, "learning_rate": 1.1472225632543298e-08, "loss": 4.2141, "step": 3648 }, { "epoch": 2.884584980237154, "grad_norm": 2.015876531600952, "learning_rate": 1.1317913901344235e-08, "loss": 4.1207, "step": 3649 }, { "epoch": 2.8853754940711465, "grad_norm": 2.350457191467285, "learning_rate": 1.1164643094793315e-08, "loss": 3.9225, "step": 3650 }, { "epoch": 2.886166007905138, "grad_norm": 2.194106101989746, "learning_rate": 1.1012413320061144e-08, "loss": 3.9411, "step": 3651 }, { "epoch": 2.8869565217391306, "grad_norm": 2.086512804031372, "learning_rate": 1.086122468359091e-08, "loss": 4.2489, "step": 3652 }, { "epoch": 2.8877470355731223, "grad_norm": 2.2784476280212402, "learning_rate": 1.0711077291097549e-08, "loss": 3.9761, "step": 3653 }, { "epoch": 2.888537549407115, "grad_norm": 2.726656913757324, "learning_rate": 1.0561971247568081e-08, "loss": 3.9756, "step": 3654 }, { "epoch": 2.8893280632411065, "grad_norm": 2.0293476581573486, "learning_rate": 1.041390665726094e-08, "loss": 4.1034, "step": 3655 }, { "epoch": 2.890118577075099, "grad_norm": 2.420776605606079, "learning_rate": 1.026688362370698e-08, "loss": 3.4568, "step": 3656 }, { "epoch": 2.8909090909090907, "grad_norm": 2.2735698223114014, "learning_rate": 1.0120902249708298e-08, "loss": 3.8057, "step": 3657 }, { "epoch": 2.891699604743083, "grad_norm": 2.2075858116149902, "learning_rate": 9.975962637338909e-09, "loss": 3.8126, "step": 3658 }, { "epoch": 2.892490118577075, "grad_norm": 1.9849635362625122, "learning_rate": 9.832064887944414e-09, "loss": 4.2868, "step": 3659 }, { "epoch": 2.8932806324110674, "grad_norm": 2.4022040367126465, "learning_rate": 9.689209102141495e-09, "loss": 4.3417, "step": 3660 }, { "epoch": 2.894071146245059, "grad_norm": 2.027559757232666, "learning_rate": 9.547395379818914e-09, "loss": 4.0576, "step": 3661 }, { "epoch": 2.8948616600790515, "grad_norm": 1.9818648099899292, "learning_rate": 9.406623820136184e-09, "loss": 4.3813, "step": 3662 }, { "epoch": 2.8956521739130436, "grad_norm": 2.0314643383026123, "learning_rate": 9.266894521524572e-09, "loss": 4.3052, "step": 3663 }, { "epoch": 2.8964426877470357, "grad_norm": 2.2388973236083984, "learning_rate": 9.128207581686255e-09, "loss": 4.0445, "step": 3664 }, { "epoch": 2.897233201581028, "grad_norm": 2.2382423877716064, "learning_rate": 8.990563097594662e-09, "loss": 3.5345, "step": 3665 }, { "epoch": 2.89802371541502, "grad_norm": 2.235269546508789, "learning_rate": 8.853961165494474e-09, "loss": 4.0181, "step": 3666 }, { "epoch": 2.898814229249012, "grad_norm": 2.0100526809692383, "learning_rate": 8.718401880901283e-09, "loss": 4.0851, "step": 3667 }, { "epoch": 2.899604743083004, "grad_norm": 3.453319787979126, "learning_rate": 8.583885338601438e-09, "loss": 4.0701, "step": 3668 }, { "epoch": 2.900395256916996, "grad_norm": 2.431528091430664, "learning_rate": 8.450411632652533e-09, "loss": 3.9435, "step": 3669 }, { "epoch": 2.9011857707509883, "grad_norm": 2.135128974914551, "learning_rate": 8.317980856382746e-09, "loss": 4.1745, "step": 3670 }, { "epoch": 2.9019762845849804, "grad_norm": 1.9437775611877441, "learning_rate": 8.186593102391005e-09, "loss": 4.3366, "step": 3671 }, { "epoch": 2.9027667984189724, "grad_norm": 2.0464460849761963, "learning_rate": 8.056248462547155e-09, "loss": 4.2027, "step": 3672 }, { "epoch": 2.9035573122529645, "grad_norm": 2.3450684547424316, "learning_rate": 7.926947027991294e-09, "loss": 3.7533, "step": 3673 }, { "epoch": 2.9043478260869566, "grad_norm": 2.2543675899505615, "learning_rate": 7.798688889134264e-09, "loss": 3.923, "step": 3674 }, { "epoch": 2.9051383399209487, "grad_norm": 2.2679543495178223, "learning_rate": 7.671474135657664e-09, "loss": 4.1268, "step": 3675 }, { "epoch": 2.905928853754941, "grad_norm": 2.3205878734588623, "learning_rate": 7.545302856513336e-09, "loss": 4.3087, "step": 3676 }, { "epoch": 2.906719367588933, "grad_norm": 2.1851110458374023, "learning_rate": 7.420175139923213e-09, "loss": 3.675, "step": 3677 }, { "epoch": 2.907509881422925, "grad_norm": 2.1511826515197754, "learning_rate": 7.2960910733799735e-09, "loss": 4.1919, "step": 3678 }, { "epoch": 2.908300395256917, "grad_norm": 2.100112199783325, "learning_rate": 7.173050743646381e-09, "loss": 3.9487, "step": 3679 }, { "epoch": 2.909090909090909, "grad_norm": 2.233828067779541, "learning_rate": 7.051054236755283e-09, "loss": 4.01, "step": 3680 }, { "epoch": 2.9098814229249013, "grad_norm": 2.854435443878174, "learning_rate": 6.930101638009612e-09, "loss": 3.7807, "step": 3681 }, { "epoch": 2.9106719367588934, "grad_norm": 2.284416437149048, "learning_rate": 6.810193031983047e-09, "loss": 3.5751, "step": 3682 }, { "epoch": 2.9114624505928854, "grad_norm": 2.224388599395752, "learning_rate": 6.691328502518357e-09, "loss": 4.0111, "step": 3683 }, { "epoch": 2.9122529644268775, "grad_norm": 2.1668715476989746, "learning_rate": 6.573508132728723e-09, "loss": 3.8814, "step": 3684 }, { "epoch": 2.9130434782608696, "grad_norm": 2.3057398796081543, "learning_rate": 6.4567320049972455e-09, "loss": 3.8245, "step": 3685 }, { "epoch": 2.9138339920948617, "grad_norm": 2.6241631507873535, "learning_rate": 6.3410002009766095e-09, "loss": 3.8099, "step": 3686 }, { "epoch": 2.914624505928854, "grad_norm": 2.307657241821289, "learning_rate": 6.226312801589584e-09, "loss": 3.7441, "step": 3687 }, { "epoch": 2.915415019762846, "grad_norm": 2.1031582355499268, "learning_rate": 6.112669887028688e-09, "loss": 4.0621, "step": 3688 }, { "epoch": 2.916205533596838, "grad_norm": 2.38348650932312, "learning_rate": 6.0000715367556915e-09, "loss": 3.9598, "step": 3689 }, { "epoch": 2.91699604743083, "grad_norm": 2.095318078994751, "learning_rate": 5.8885178295022824e-09, "loss": 4.1873, "step": 3690 }, { "epoch": 2.917786561264822, "grad_norm": 2.269190549850464, "learning_rate": 5.7780088432699e-09, "loss": 3.849, "step": 3691 }, { "epoch": 2.9185770750988143, "grad_norm": 2.1324172019958496, "learning_rate": 5.668544655329066e-09, "loss": 3.9069, "step": 3692 }, { "epoch": 2.9193675889328063, "grad_norm": 2.3008220195770264, "learning_rate": 5.560125342219891e-09, "loss": 3.9023, "step": 3693 }, { "epoch": 2.9201581027667984, "grad_norm": 2.0852489471435547, "learning_rate": 5.452750979752064e-09, "loss": 3.8929, "step": 3694 }, { "epoch": 2.9209486166007905, "grad_norm": 2.1566145420074463, "learning_rate": 5.3464216430045285e-09, "loss": 3.7971, "step": 3695 }, { "epoch": 2.9217391304347826, "grad_norm": 2.311115264892578, "learning_rate": 5.241137406325314e-09, "loss": 3.918, "step": 3696 }, { "epoch": 2.9225296442687747, "grad_norm": 2.210615634918213, "learning_rate": 5.136898343331864e-09, "loss": 3.7186, "step": 3697 }, { "epoch": 2.923320158102767, "grad_norm": 2.1629834175109863, "learning_rate": 5.033704526911042e-09, "loss": 4.1007, "step": 3698 }, { "epoch": 2.924110671936759, "grad_norm": 2.0084080696105957, "learning_rate": 4.931556029218465e-09, "loss": 4.3295, "step": 3699 }, { "epoch": 2.924901185770751, "grad_norm": 2.184882164001465, "learning_rate": 4.8304529216788294e-09, "loss": 3.624, "step": 3700 }, { "epoch": 2.925691699604743, "grad_norm": 2.1491193771362305, "learning_rate": 4.730395274986088e-09, "loss": 4.0361, "step": 3701 }, { "epoch": 2.926482213438735, "grad_norm": 2.1480321884155273, "learning_rate": 4.631383159103442e-09, "loss": 4.1287, "step": 3702 }, { "epoch": 2.9272727272727272, "grad_norm": 2.292588949203491, "learning_rate": 4.533416643262178e-09, "loss": 3.5916, "step": 3703 }, { "epoch": 2.9280632411067193, "grad_norm": 2.4070181846618652, "learning_rate": 4.4364957959633355e-09, "loss": 3.8318, "step": 3704 }, { "epoch": 2.9288537549407114, "grad_norm": 1.8900561332702637, "learning_rate": 4.340620684976371e-09, "loss": 4.4303, "step": 3705 }, { "epoch": 2.9296442687747035, "grad_norm": 2.285658359527588, "learning_rate": 4.2457913773396585e-09, "loss": 4.0054, "step": 3706 }, { "epoch": 2.9304347826086956, "grad_norm": 1.992883563041687, "learning_rate": 4.152007939360158e-09, "loss": 4.1152, "step": 3707 }, { "epoch": 2.9312252964426877, "grad_norm": 2.199526309967041, "learning_rate": 4.059270436613749e-09, "loss": 4.0566, "step": 3708 }, { "epoch": 2.93201581027668, "grad_norm": 2.1821181774139404, "learning_rate": 3.9675789339448935e-09, "loss": 4.1684, "step": 3709 }, { "epoch": 2.932806324110672, "grad_norm": 2.030928373336792, "learning_rate": 3.876933495466473e-09, "loss": 4.0656, "step": 3710 }, { "epoch": 2.933596837944664, "grad_norm": 2.266331672668457, "learning_rate": 3.787334184560287e-09, "loss": 4.1192, "step": 3711 }, { "epoch": 2.934387351778656, "grad_norm": 2.2579774856567383, "learning_rate": 3.6987810638763885e-09, "loss": 4.1263, "step": 3712 }, { "epoch": 2.935177865612648, "grad_norm": 2.157538652420044, "learning_rate": 3.611274195333414e-09, "loss": 3.8551, "step": 3713 }, { "epoch": 2.9359683794466402, "grad_norm": 1.9892104864120483, "learning_rate": 3.5248136401184184e-09, "loss": 4.1347, "step": 3714 }, { "epoch": 2.9367588932806323, "grad_norm": 2.428123712539673, "learning_rate": 3.4393994586867093e-09, "loss": 4.1124, "step": 3715 }, { "epoch": 2.9375494071146244, "grad_norm": 2.0590643882751465, "learning_rate": 3.3550317107621796e-09, "loss": 3.9092, "step": 3716 }, { "epoch": 2.9383399209486165, "grad_norm": 2.2221388816833496, "learning_rate": 3.2717104553369737e-09, "loss": 3.9984, "step": 3717 }, { "epoch": 2.9391304347826086, "grad_norm": 2.1013576984405518, "learning_rate": 3.189435750671321e-09, "loss": 4.2038, "step": 3718 }, { "epoch": 2.9399209486166007, "grad_norm": 2.1869373321533203, "learning_rate": 3.1082076542940373e-09, "loss": 4.1008, "step": 3719 }, { "epoch": 2.940711462450593, "grad_norm": 2.313753843307495, "learning_rate": 3.0280262230015233e-09, "loss": 3.985, "step": 3720 }, { "epoch": 2.941501976284585, "grad_norm": 2.3213863372802734, "learning_rate": 2.948891512858931e-09, "loss": 3.7184, "step": 3721 }, { "epoch": 2.942292490118577, "grad_norm": 1.9953092336654663, "learning_rate": 2.8708035791993325e-09, "loss": 4.19, "step": 3722 }, { "epoch": 2.943083003952569, "grad_norm": 2.511647939682007, "learning_rate": 2.7937624766235513e-09, "loss": 3.6234, "step": 3723 }, { "epoch": 2.943873517786561, "grad_norm": 2.3263304233551025, "learning_rate": 2.7177682590009965e-09, "loss": 4.1743, "step": 3724 }, { "epoch": 2.9446640316205532, "grad_norm": 2.1373445987701416, "learning_rate": 2.6428209794684963e-09, "loss": 3.9198, "step": 3725 }, { "epoch": 2.9454545454545453, "grad_norm": 2.425595998764038, "learning_rate": 2.568920690431298e-09, "loss": 4.132, "step": 3726 }, { "epoch": 2.9462450592885374, "grad_norm": 2.1239235401153564, "learning_rate": 2.4960674435624e-09, "loss": 3.9798, "step": 3727 }, { "epoch": 2.9470355731225295, "grad_norm": 2.025714874267578, "learning_rate": 2.424261289802554e-09, "loss": 4.0999, "step": 3728 }, { "epoch": 2.9478260869565216, "grad_norm": 2.0788090229034424, "learning_rate": 2.353502279360431e-09, "loss": 3.989, "step": 3729 }, { "epoch": 2.9486166007905137, "grad_norm": 2.122539520263672, "learning_rate": 2.2837904617124537e-09, "loss": 4.2067, "step": 3730 }, { "epoch": 2.9494071146245058, "grad_norm": 2.3937129974365234, "learning_rate": 2.2151258856032973e-09, "loss": 4.0911, "step": 3731 }, { "epoch": 2.950197628458498, "grad_norm": 1.9603692293167114, "learning_rate": 2.147508599044723e-09, "loss": 4.1711, "step": 3732 }, { "epoch": 2.95098814229249, "grad_norm": 2.14597487449646, "learning_rate": 2.0809386493164105e-09, "loss": 4.0877, "step": 3733 }, { "epoch": 2.951778656126482, "grad_norm": 2.1532363891601562, "learning_rate": 2.0154160829659594e-09, "loss": 4.1769, "step": 3734 }, { "epoch": 2.9525691699604746, "grad_norm": 2.3493099212646484, "learning_rate": 1.9509409458082215e-09, "loss": 3.6094, "step": 3735 }, { "epoch": 2.9533596837944662, "grad_norm": 1.9975420236587524, "learning_rate": 1.8875132829261344e-09, "loss": 4.4376, "step": 3736 }, { "epoch": 2.9541501976284588, "grad_norm": 2.145129919052124, "learning_rate": 1.8251331386695546e-09, "loss": 4.098, "step": 3737 }, { "epoch": 2.9549407114624504, "grad_norm": 1.9992343187332153, "learning_rate": 1.7638005566567583e-09, "loss": 4.0315, "step": 3738 }, { "epoch": 2.955731225296443, "grad_norm": 2.4248147010803223, "learning_rate": 1.7035155797726077e-09, "loss": 3.688, "step": 3739 }, { "epoch": 2.9565217391304346, "grad_norm": 2.971385955810547, "learning_rate": 1.644278250170217e-09, "loss": 3.7777, "step": 3740 }, { "epoch": 2.957312252964427, "grad_norm": 2.219632863998413, "learning_rate": 1.5860886092696203e-09, "loss": 3.8043, "step": 3741 }, { "epoch": 2.9581027667984188, "grad_norm": 2.2447054386138916, "learning_rate": 1.5289466977586042e-09, "loss": 3.6728, "step": 3742 }, { "epoch": 2.9588932806324113, "grad_norm": 1.9014555215835571, "learning_rate": 1.4728525555923745e-09, "loss": 4.3318, "step": 3743 }, { "epoch": 2.959683794466403, "grad_norm": 2.4202516078948975, "learning_rate": 1.417806221993223e-09, "loss": 3.8294, "step": 3744 }, { "epoch": 2.9604743083003955, "grad_norm": 2.3762643337249756, "learning_rate": 1.3638077354511946e-09, "loss": 3.8038, "step": 3745 }, { "epoch": 2.961264822134387, "grad_norm": 2.1907665729522705, "learning_rate": 1.3108571337229203e-09, "loss": 3.948, "step": 3746 }, { "epoch": 2.9620553359683797, "grad_norm": 2.06706166267395, "learning_rate": 1.2589544538334497e-09, "loss": 4.1549, "step": 3747 }, { "epoch": 2.9628458498023713, "grad_norm": 2.0846636295318604, "learning_rate": 1.2080997320739195e-09, "loss": 4.1817, "step": 3748 }, { "epoch": 2.963636363636364, "grad_norm": 2.1722583770751953, "learning_rate": 1.1582930040035522e-09, "loss": 3.9641, "step": 3749 }, { "epoch": 2.9644268774703555, "grad_norm": 1.9204069375991821, "learning_rate": 1.1095343044484896e-09, "loss": 4.3336, "step": 3750 }, { "epoch": 2.965217391304348, "grad_norm": 2.3287456035614014, "learning_rate": 1.06182366750196e-09, "loss": 3.7669, "step": 3751 }, { "epoch": 2.9660079051383397, "grad_norm": 2.9903182983398438, "learning_rate": 1.0151611265244442e-09, "loss": 4.0738, "step": 3752 }, { "epoch": 2.966798418972332, "grad_norm": 2.378804922103882, "learning_rate": 9.695467141436765e-10, "loss": 3.6611, "step": 3753 }, { "epoch": 2.967588932806324, "grad_norm": 2.0314648151397705, "learning_rate": 9.249804622543101e-10, "loss": 3.9991, "step": 3754 }, { "epoch": 2.9683794466403164, "grad_norm": 2.149047374725342, "learning_rate": 8.814624020182515e-10, "loss": 4.0114, "step": 3755 }, { "epoch": 2.969169960474308, "grad_norm": 2.1564903259277344, "learning_rate": 8.3899256386466e-10, "loss": 4.0806, "step": 3756 }, { "epoch": 2.9699604743083006, "grad_norm": 2.169509172439575, "learning_rate": 7.975709774892814e-10, "loss": 3.6668, "step": 3757 }, { "epoch": 2.970750988142292, "grad_norm": 2.205223321914673, "learning_rate": 7.571976718551143e-10, "loss": 4.0854, "step": 3758 }, { "epoch": 2.9715415019762847, "grad_norm": 2.201673984527588, "learning_rate": 7.178726751924103e-10, "loss": 3.9815, "step": 3759 }, { "epoch": 2.972332015810277, "grad_norm": 2.2580482959747314, "learning_rate": 6.795960149981739e-10, "loss": 3.9455, "step": 3760 }, { "epoch": 2.973122529644269, "grad_norm": 2.6672136783599854, "learning_rate": 6.423677180361631e-10, "loss": 4.1922, "step": 3761 }, { "epoch": 2.973913043478261, "grad_norm": 2.2721006870269775, "learning_rate": 6.061878103377216e-10, "loss": 3.9445, "step": 3762 }, { "epoch": 2.974703557312253, "grad_norm": 2.2624199390411377, "learning_rate": 5.710563172006133e-10, "loss": 3.9968, "step": 3763 }, { "epoch": 2.975494071146245, "grad_norm": 2.146167278289795, "learning_rate": 5.369732631896885e-10, "loss": 4.0665, "step": 3764 }, { "epoch": 2.9762845849802373, "grad_norm": 2.125028610229492, "learning_rate": 5.039386721365502e-10, "loss": 4.1604, "step": 3765 }, { "epoch": 2.9770750988142294, "grad_norm": 2.0719096660614014, "learning_rate": 4.719525671400549e-10, "loss": 3.8974, "step": 3766 }, { "epoch": 2.9778656126482215, "grad_norm": 1.9529894590377808, "learning_rate": 4.410149705656452e-10, "loss": 4.2126, "step": 3767 }, { "epoch": 2.9786561264822136, "grad_norm": 2.1562442779541016, "learning_rate": 4.111259040455173e-10, "loss": 3.802, "step": 3768 }, { "epoch": 2.9794466403162057, "grad_norm": 2.3606181144714355, "learning_rate": 3.8228538847912e-10, "loss": 3.9539, "step": 3769 }, { "epoch": 2.9802371541501977, "grad_norm": 2.162860870361328, "learning_rate": 3.544934440323222e-10, "loss": 4.1356, "step": 3770 }, { "epoch": 2.98102766798419, "grad_norm": 2.051844358444214, "learning_rate": 3.277500901379127e-10, "loss": 4.281, "step": 3771 }, { "epoch": 2.981818181818182, "grad_norm": 2.1907451152801514, "learning_rate": 3.020553454957664e-10, "loss": 3.7551, "step": 3772 }, { "epoch": 2.982608695652174, "grad_norm": 2.4655940532684326, "learning_rate": 2.77409228072012e-10, "loss": 3.6629, "step": 3773 }, { "epoch": 2.983399209486166, "grad_norm": 2.1938822269439697, "learning_rate": 2.5381175510003073e-10, "loss": 3.8042, "step": 3774 }, { "epoch": 2.984189723320158, "grad_norm": 2.162123441696167, "learning_rate": 2.3126294307979078e-10, "loss": 4.1675, "step": 3775 }, { "epoch": 2.9849802371541503, "grad_norm": 2.294116973876953, "learning_rate": 2.0976280777801336e-10, "loss": 3.6904, "step": 3776 }, { "epoch": 2.9857707509881424, "grad_norm": 2.1347529888153076, "learning_rate": 1.8931136422800644e-10, "loss": 4.3509, "step": 3777 }, { "epoch": 2.9865612648221345, "grad_norm": 1.9518482685089111, "learning_rate": 1.6990862672999762e-10, "loss": 4.2226, "step": 3778 }, { "epoch": 2.9873517786561266, "grad_norm": 2.183629035949707, "learning_rate": 1.5155460885096783e-10, "loss": 3.9102, "step": 3779 }, { "epoch": 2.9881422924901186, "grad_norm": 2.0159308910369873, "learning_rate": 1.3424932342431807e-10, "loss": 4.1046, "step": 3780 }, { "epoch": 2.9889328063241107, "grad_norm": 2.1142590045928955, "learning_rate": 1.1799278255053558e-10, "loss": 3.9422, "step": 3781 }, { "epoch": 2.989723320158103, "grad_norm": 2.0726335048675537, "learning_rate": 1.0278499759669435e-10, "loss": 3.968, "step": 3782 }, { "epoch": 2.990513833992095, "grad_norm": 1.957381010055542, "learning_rate": 8.862597919612192e-11, "loss": 3.9945, "step": 3783 }, { "epoch": 2.991304347826087, "grad_norm": 2.150315999984741, "learning_rate": 7.551573724939864e-11, "loss": 4.074, "step": 3784 }, { "epoch": 2.992094861660079, "grad_norm": 2.188199281692505, "learning_rate": 6.345428092369155e-11, "loss": 4.1437, "step": 3785 }, { "epoch": 2.992885375494071, "grad_norm": 2.1374096870422363, "learning_rate": 5.244161865225472e-11, "loss": 4.2765, "step": 3786 }, { "epoch": 2.9936758893280633, "grad_norm": 2.0292553901672363, "learning_rate": 4.24777581357616e-11, "loss": 4.2254, "step": 3787 }, { "epoch": 2.9944664031620554, "grad_norm": 3.3473474979400635, "learning_rate": 3.3562706341139227e-11, "loss": 3.9786, "step": 3788 }, { "epoch": 2.9952569169960475, "grad_norm": 2.037400245666504, "learning_rate": 2.5696469502067832e-11, "loss": 4.2059, "step": 3789 }, { "epoch": 2.9960474308300395, "grad_norm": 2.0290942192077637, "learning_rate": 1.8879053118647795e-11, "loss": 4.1682, "step": 3790 }, { "epoch": 2.9968379446640316, "grad_norm": 1.9987640380859375, "learning_rate": 1.3110461957732689e-11, "loss": 4.1246, "step": 3791 }, { "epoch": 2.9976284584980237, "grad_norm": 2.1732001304626465, "learning_rate": 8.390700053262368e-12, "loss": 3.7434, "step": 3792 }, { "epoch": 2.998418972332016, "grad_norm": 2.062969446182251, "learning_rate": 4.719770705097215e-12, "loss": 4.0229, "step": 3793 }, { "epoch": 2.999209486166008, "grad_norm": 2.084434747695923, "learning_rate": 2.097676480017352e-12, "loss": 4.1382, "step": 3794 }, { "epoch": 3.0, "grad_norm": 2.243303060531616, "learning_rate": 5.244192117226376e-13, "loss": 3.8611, "step": 3795 }, { "epoch": 3.0, "step": 3795, "total_flos": 3.843306670868398e+18, "train_loss": 4.010649446065247, "train_runtime": 2287.4936, "train_samples_per_second": 398.313, "train_steps_per_second": 1.659 } ], "logging_steps": 1.0, "max_steps": 3795, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.843306670868398e+18, "train_batch_size": 10, "trial_name": null, "trial_params": null }