{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1881, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001594896331738437, "grad_norm": 55.58218859989758, "learning_rate": 2.6455026455026455e-07, "loss": 11.1619, "step": 1 }, { "epoch": 0.003189792663476874, "grad_norm": 54.72857616382741, "learning_rate": 5.291005291005291e-07, "loss": 11.1725, "step": 2 }, { "epoch": 0.004784688995215311, "grad_norm": 57.21395933420615, "learning_rate": 7.936507936507937e-07, "loss": 10.9821, "step": 3 }, { "epoch": 0.006379585326953748, "grad_norm": 56.258154469223996, "learning_rate": 1.0582010582010582e-06, "loss": 11.0189, "step": 4 }, { "epoch": 0.007974481658692184, "grad_norm": 56.396125627976325, "learning_rate": 1.3227513227513228e-06, "loss": 11.0261, "step": 5 }, { "epoch": 0.009569377990430622, "grad_norm": 58.028218869806174, "learning_rate": 1.5873015873015873e-06, "loss": 10.9743, "step": 6 }, { "epoch": 0.011164274322169059, "grad_norm": 59.07519067102789, "learning_rate": 1.8518518518518519e-06, "loss": 10.9181, "step": 7 }, { "epoch": 0.012759170653907496, "grad_norm": 59.95939351753766, "learning_rate": 2.1164021164021164e-06, "loss": 10.7761, "step": 8 }, { "epoch": 0.014354066985645933, "grad_norm": 63.00344296466596, "learning_rate": 2.3809523809523808e-06, "loss": 10.6495, "step": 9 }, { "epoch": 0.01594896331738437, "grad_norm": 88.0552470799903, "learning_rate": 2.6455026455026455e-06, "loss": 9.377, "step": 10 }, { "epoch": 0.017543859649122806, "grad_norm": 102.27519188453438, "learning_rate": 2.91005291005291e-06, "loss": 8.9753, "step": 11 }, { "epoch": 0.019138755980861243, "grad_norm": 109.99193103164863, "learning_rate": 3.1746031746031746e-06, "loss": 8.6955, "step": 12 }, { "epoch": 0.02073365231259968, "grad_norm": 63.702370040068715, "learning_rate": 3.439153439153439e-06, "loss": 3.5894, "step": 13 }, { "epoch": 0.022328548644338118, "grad_norm": 56.21700650763068, "learning_rate": 3.7037037037037037e-06, "loss": 3.3567, "step": 14 }, { "epoch": 0.023923444976076555, "grad_norm": 40.23159360264269, "learning_rate": 3.968253968253968e-06, "loss": 2.6852, "step": 15 }, { "epoch": 0.025518341307814992, "grad_norm": 39.790578375594094, "learning_rate": 4.232804232804233e-06, "loss": 2.4539, "step": 16 }, { "epoch": 0.02711323763955343, "grad_norm": 7.371958144998125, "learning_rate": 4.497354497354498e-06, "loss": 1.4916, "step": 17 }, { "epoch": 0.028708133971291867, "grad_norm": 4.834400576444921, "learning_rate": 4.7619047619047615e-06, "loss": 1.2944, "step": 18 }, { "epoch": 0.030303030303030304, "grad_norm": 4.11744436875918, "learning_rate": 5.026455026455026e-06, "loss": 1.281, "step": 19 }, { "epoch": 0.03189792663476874, "grad_norm": 3.0352225754985547, "learning_rate": 5.291005291005291e-06, "loss": 1.183, "step": 20 }, { "epoch": 0.03349282296650718, "grad_norm": 2.4191430168646133, "learning_rate": 5.555555555555556e-06, "loss": 1.1206, "step": 21 }, { "epoch": 0.03508771929824561, "grad_norm": 2.0081821282023644, "learning_rate": 5.82010582010582e-06, "loss": 1.1004, "step": 22 }, { "epoch": 0.03668261562998405, "grad_norm": 1.4307266470366664, "learning_rate": 6.0846560846560845e-06, "loss": 0.9908, "step": 23 }, { "epoch": 0.03827751196172249, "grad_norm": 19.832263264681906, "learning_rate": 6.349206349206349e-06, "loss": 0.9484, "step": 24 }, { "epoch": 0.03987240829346093, "grad_norm": 3.0476226265867585, "learning_rate": 6.613756613756614e-06, "loss": 0.9375, "step": 25 }, { "epoch": 0.04146730462519936, "grad_norm": 1.3225862580339947, "learning_rate": 6.878306878306878e-06, "loss": 0.912, "step": 26 }, { "epoch": 0.0430622009569378, "grad_norm": 1.0274431048876067, "learning_rate": 7.142857142857143e-06, "loss": 0.8625, "step": 27 }, { "epoch": 0.044657097288676235, "grad_norm": 0.9550025500524255, "learning_rate": 7.4074074074074075e-06, "loss": 0.8862, "step": 28 }, { "epoch": 0.046251993620414676, "grad_norm": 0.8830996875679307, "learning_rate": 7.671957671957672e-06, "loss": 0.8597, "step": 29 }, { "epoch": 0.04784688995215311, "grad_norm": 0.7152940556153602, "learning_rate": 7.936507936507936e-06, "loss": 0.7851, "step": 30 }, { "epoch": 0.049441786283891544, "grad_norm": 0.9580790328795702, "learning_rate": 8.201058201058202e-06, "loss": 0.7927, "step": 31 }, { "epoch": 0.051036682615629984, "grad_norm": 0.6830273132698791, "learning_rate": 8.465608465608466e-06, "loss": 0.7623, "step": 32 }, { "epoch": 0.05263157894736842, "grad_norm": 0.6577279279384521, "learning_rate": 8.73015873015873e-06, "loss": 0.7396, "step": 33 }, { "epoch": 0.05422647527910686, "grad_norm": 0.6614298188165546, "learning_rate": 8.994708994708995e-06, "loss": 0.7064, "step": 34 }, { "epoch": 0.05582137161084529, "grad_norm": 0.6818065339768941, "learning_rate": 9.259259259259259e-06, "loss": 0.7335, "step": 35 }, { "epoch": 0.05741626794258373, "grad_norm": 0.6015123600525427, "learning_rate": 9.523809523809523e-06, "loss": 0.7208, "step": 36 }, { "epoch": 0.05901116427432217, "grad_norm": 0.5992204759890278, "learning_rate": 9.788359788359789e-06, "loss": 0.7211, "step": 37 }, { "epoch": 0.06060606060606061, "grad_norm": 0.535504329561983, "learning_rate": 1.0052910052910053e-05, "loss": 0.6856, "step": 38 }, { "epoch": 0.06220095693779904, "grad_norm": 0.5056049484620948, "learning_rate": 1.0317460317460318e-05, "loss": 0.6987, "step": 39 }, { "epoch": 0.06379585326953748, "grad_norm": 0.525838219385248, "learning_rate": 1.0582010582010582e-05, "loss": 0.658, "step": 40 }, { "epoch": 0.06539074960127592, "grad_norm": 0.5020443669901373, "learning_rate": 1.0846560846560846e-05, "loss": 0.6867, "step": 41 }, { "epoch": 0.06698564593301436, "grad_norm": 0.48751125083787783, "learning_rate": 1.1111111111111112e-05, "loss": 0.6951, "step": 42 }, { "epoch": 0.0685805422647528, "grad_norm": 0.4506968233346597, "learning_rate": 1.1375661375661376e-05, "loss": 0.6716, "step": 43 }, { "epoch": 0.07017543859649122, "grad_norm": 0.4426776142565564, "learning_rate": 1.164021164021164e-05, "loss": 0.6663, "step": 44 }, { "epoch": 0.07177033492822966, "grad_norm": 0.48177599627884016, "learning_rate": 1.1904761904761905e-05, "loss": 0.6423, "step": 45 }, { "epoch": 0.0733652312599681, "grad_norm": 0.4491621826027903, "learning_rate": 1.2169312169312169e-05, "loss": 0.6559, "step": 46 }, { "epoch": 0.07496012759170653, "grad_norm": 0.4594732056084965, "learning_rate": 1.2433862433862433e-05, "loss": 0.6635, "step": 47 }, { "epoch": 0.07655502392344497, "grad_norm": 0.33680690637609145, "learning_rate": 1.2698412698412699e-05, "loss": 0.6136, "step": 48 }, { "epoch": 0.07814992025518341, "grad_norm": 0.4511454408052678, "learning_rate": 1.2962962962962962e-05, "loss": 0.6193, "step": 49 }, { "epoch": 0.07974481658692185, "grad_norm": 0.3668434651201649, "learning_rate": 1.3227513227513228e-05, "loss": 0.5947, "step": 50 }, { "epoch": 0.08133971291866028, "grad_norm": 0.32769130275932523, "learning_rate": 1.3492063492063492e-05, "loss": 0.6203, "step": 51 }, { "epoch": 0.08293460925039872, "grad_norm": 0.3230207781671609, "learning_rate": 1.3756613756613756e-05, "loss": 0.5971, "step": 52 }, { "epoch": 0.08452950558213716, "grad_norm": 0.33728890932212424, "learning_rate": 1.4021164021164022e-05, "loss": 0.618, "step": 53 }, { "epoch": 0.0861244019138756, "grad_norm": 0.34084967138156913, "learning_rate": 1.4285714285714285e-05, "loss": 0.6144, "step": 54 }, { "epoch": 0.08771929824561403, "grad_norm": 0.29624082179599687, "learning_rate": 1.455026455026455e-05, "loss": 0.6145, "step": 55 }, { "epoch": 0.08931419457735247, "grad_norm": 0.29616568973904767, "learning_rate": 1.4814814814814815e-05, "loss": 0.6068, "step": 56 }, { "epoch": 0.09090909090909091, "grad_norm": 0.3138191038422671, "learning_rate": 1.5079365079365079e-05, "loss": 0.6019, "step": 57 }, { "epoch": 0.09250398724082935, "grad_norm": 0.29656119208143994, "learning_rate": 1.5343915343915344e-05, "loss": 0.6226, "step": 58 }, { "epoch": 0.09409888357256778, "grad_norm": 0.2729975414406283, "learning_rate": 1.560846560846561e-05, "loss": 0.5863, "step": 59 }, { "epoch": 0.09569377990430622, "grad_norm": 0.2749356933515436, "learning_rate": 1.5873015873015872e-05, "loss": 0.5859, "step": 60 }, { "epoch": 0.09728867623604466, "grad_norm": 0.3085880359727754, "learning_rate": 1.6137566137566136e-05, "loss": 0.6003, "step": 61 }, { "epoch": 0.09888357256778309, "grad_norm": 0.2919076300009177, "learning_rate": 1.6402116402116404e-05, "loss": 0.5773, "step": 62 }, { "epoch": 0.10047846889952153, "grad_norm": 0.24667089010885335, "learning_rate": 1.6666666666666667e-05, "loss": 0.5714, "step": 63 }, { "epoch": 0.10207336523125997, "grad_norm": 0.2600901332832448, "learning_rate": 1.693121693121693e-05, "loss": 0.5842, "step": 64 }, { "epoch": 0.10366826156299841, "grad_norm": 0.30379915155228365, "learning_rate": 1.7195767195767195e-05, "loss": 0.6141, "step": 65 }, { "epoch": 0.10526315789473684, "grad_norm": 0.25157707969023857, "learning_rate": 1.746031746031746e-05, "loss": 0.5807, "step": 66 }, { "epoch": 0.10685805422647528, "grad_norm": 0.2727822990921217, "learning_rate": 1.7724867724867723e-05, "loss": 0.5701, "step": 67 }, { "epoch": 0.10845295055821372, "grad_norm": 0.2937920421953775, "learning_rate": 1.798941798941799e-05, "loss": 0.5833, "step": 68 }, { "epoch": 0.11004784688995216, "grad_norm": 0.25196562558458424, "learning_rate": 1.8253968253968254e-05, "loss": 0.5659, "step": 69 }, { "epoch": 0.11164274322169059, "grad_norm": 0.24745434949871412, "learning_rate": 1.8518518518518518e-05, "loss": 0.5785, "step": 70 }, { "epoch": 0.11323763955342903, "grad_norm": 0.2700967310532773, "learning_rate": 1.8783068783068782e-05, "loss": 0.5723, "step": 71 }, { "epoch": 0.11483253588516747, "grad_norm": 0.2987954094379614, "learning_rate": 1.9047619047619046e-05, "loss": 0.5655, "step": 72 }, { "epoch": 0.11642743221690591, "grad_norm": 0.24948156051115444, "learning_rate": 1.9312169312169313e-05, "loss": 0.5557, "step": 73 }, { "epoch": 0.11802232854864433, "grad_norm": 0.2641877706654791, "learning_rate": 1.9576719576719577e-05, "loss": 0.5586, "step": 74 }, { "epoch": 0.11961722488038277, "grad_norm": 0.2703559814675665, "learning_rate": 1.984126984126984e-05, "loss": 0.572, "step": 75 }, { "epoch": 0.12121212121212122, "grad_norm": 0.2616267509728662, "learning_rate": 2.0105820105820105e-05, "loss": 0.5618, "step": 76 }, { "epoch": 0.12280701754385964, "grad_norm": 0.2201388058285762, "learning_rate": 2.037037037037037e-05, "loss": 0.5576, "step": 77 }, { "epoch": 0.12440191387559808, "grad_norm": 0.23713310796968134, "learning_rate": 2.0634920634920636e-05, "loss": 0.5439, "step": 78 }, { "epoch": 0.12599681020733652, "grad_norm": 0.2715918933842359, "learning_rate": 2.08994708994709e-05, "loss": 0.5546, "step": 79 }, { "epoch": 0.12759170653907495, "grad_norm": 0.2566362464601795, "learning_rate": 2.1164021164021164e-05, "loss": 0.5552, "step": 80 }, { "epoch": 0.1291866028708134, "grad_norm": 0.2582086790181626, "learning_rate": 2.1428571428571428e-05, "loss": 0.5453, "step": 81 }, { "epoch": 0.13078149920255183, "grad_norm": 0.24193458217326355, "learning_rate": 2.1693121693121692e-05, "loss": 0.5527, "step": 82 }, { "epoch": 0.13237639553429026, "grad_norm": 0.23613230778696434, "learning_rate": 2.1957671957671956e-05, "loss": 0.5392, "step": 83 }, { "epoch": 0.1339712918660287, "grad_norm": 0.27250473743432846, "learning_rate": 2.2222222222222223e-05, "loss": 0.5483, "step": 84 }, { "epoch": 0.13556618819776714, "grad_norm": 0.24980743790157978, "learning_rate": 2.2486772486772487e-05, "loss": 0.5533, "step": 85 }, { "epoch": 0.1371610845295056, "grad_norm": 0.25041888307311505, "learning_rate": 2.275132275132275e-05, "loss": 0.5419, "step": 86 }, { "epoch": 0.13875598086124402, "grad_norm": 0.28709534951381654, "learning_rate": 2.3015873015873015e-05, "loss": 0.5417, "step": 87 }, { "epoch": 0.14035087719298245, "grad_norm": 0.26626343099038363, "learning_rate": 2.328042328042328e-05, "loss": 0.5571, "step": 88 }, { "epoch": 0.1419457735247209, "grad_norm": 0.2731893926684258, "learning_rate": 2.3544973544973546e-05, "loss": 0.5619, "step": 89 }, { "epoch": 0.14354066985645933, "grad_norm": 0.2914433482458827, "learning_rate": 2.380952380952381e-05, "loss": 0.557, "step": 90 }, { "epoch": 0.14513556618819776, "grad_norm": 0.2356154500209484, "learning_rate": 2.4074074074074074e-05, "loss": 0.5274, "step": 91 }, { "epoch": 0.1467304625199362, "grad_norm": 0.25183708098244206, "learning_rate": 2.4338624338624338e-05, "loss": 0.5511, "step": 92 }, { "epoch": 0.14832535885167464, "grad_norm": 0.27569577906742787, "learning_rate": 2.4603174603174602e-05, "loss": 0.5365, "step": 93 }, { "epoch": 0.14992025518341306, "grad_norm": 0.2607352893567811, "learning_rate": 2.4867724867724866e-05, "loss": 0.5414, "step": 94 }, { "epoch": 0.15151515151515152, "grad_norm": 0.27215732487534494, "learning_rate": 2.5132275132275137e-05, "loss": 0.529, "step": 95 }, { "epoch": 0.15311004784688995, "grad_norm": 0.2522187183943027, "learning_rate": 2.5396825396825397e-05, "loss": 0.5265, "step": 96 }, { "epoch": 0.1547049441786284, "grad_norm": 0.28300034244540423, "learning_rate": 2.5661375661375664e-05, "loss": 0.5434, "step": 97 }, { "epoch": 0.15629984051036683, "grad_norm": 0.2916353653498682, "learning_rate": 2.5925925925925925e-05, "loss": 0.5302, "step": 98 }, { "epoch": 0.15789473684210525, "grad_norm": 0.27047732133962227, "learning_rate": 2.6190476190476192e-05, "loss": 0.5337, "step": 99 }, { "epoch": 0.1594896331738437, "grad_norm": 0.25645158418862735, "learning_rate": 2.6455026455026456e-05, "loss": 0.5247, "step": 100 }, { "epoch": 0.16108452950558214, "grad_norm": 0.25002459865535176, "learning_rate": 2.6719576719576723e-05, "loss": 0.5274, "step": 101 }, { "epoch": 0.16267942583732056, "grad_norm": 0.28319110066592224, "learning_rate": 2.6984126984126984e-05, "loss": 0.536, "step": 102 }, { "epoch": 0.16427432216905902, "grad_norm": 0.2479781126492837, "learning_rate": 2.724867724867725e-05, "loss": 0.5056, "step": 103 }, { "epoch": 0.16586921850079744, "grad_norm": 0.26268846214005237, "learning_rate": 2.7513227513227512e-05, "loss": 0.53, "step": 104 }, { "epoch": 0.1674641148325359, "grad_norm": 0.26401097679694047, "learning_rate": 2.777777777777778e-05, "loss": 0.5327, "step": 105 }, { "epoch": 0.16905901116427433, "grad_norm": 0.26459644010805233, "learning_rate": 2.8042328042328043e-05, "loss": 0.519, "step": 106 }, { "epoch": 0.17065390749601275, "grad_norm": 0.2861962302180895, "learning_rate": 2.830687830687831e-05, "loss": 0.5228, "step": 107 }, { "epoch": 0.1722488038277512, "grad_norm": 0.39849714853173235, "learning_rate": 2.857142857142857e-05, "loss": 0.5278, "step": 108 }, { "epoch": 0.17384370015948963, "grad_norm": 0.31130049266878534, "learning_rate": 2.8835978835978838e-05, "loss": 0.533, "step": 109 }, { "epoch": 0.17543859649122806, "grad_norm": 0.26999235678825106, "learning_rate": 2.91005291005291e-05, "loss": 0.516, "step": 110 }, { "epoch": 0.17703349282296652, "grad_norm": 0.2808092473569607, "learning_rate": 2.9365079365079366e-05, "loss": 0.5252, "step": 111 }, { "epoch": 0.17862838915470494, "grad_norm": 0.30505182125439334, "learning_rate": 2.962962962962963e-05, "loss": 0.5237, "step": 112 }, { "epoch": 0.18022328548644337, "grad_norm": 0.3053013943169774, "learning_rate": 2.9894179894179897e-05, "loss": 0.505, "step": 113 }, { "epoch": 0.18181818181818182, "grad_norm": 0.3407039080324523, "learning_rate": 3.0158730158730158e-05, "loss": 0.5167, "step": 114 }, { "epoch": 0.18341307814992025, "grad_norm": 0.2964935750847215, "learning_rate": 3.0423280423280425e-05, "loss": 0.5054, "step": 115 }, { "epoch": 0.1850079744816587, "grad_norm": 0.322379179453101, "learning_rate": 3.068783068783069e-05, "loss": 0.5139, "step": 116 }, { "epoch": 0.18660287081339713, "grad_norm": 0.30088694957209566, "learning_rate": 3.095238095238095e-05, "loss": 0.529, "step": 117 }, { "epoch": 0.18819776714513556, "grad_norm": 0.32946246516193456, "learning_rate": 3.121693121693122e-05, "loss": 0.4964, "step": 118 }, { "epoch": 0.189792663476874, "grad_norm": 0.3281400457723929, "learning_rate": 3.148148148148148e-05, "loss": 0.5243, "step": 119 }, { "epoch": 0.19138755980861244, "grad_norm": 0.2796262875669856, "learning_rate": 3.1746031746031745e-05, "loss": 0.5176, "step": 120 }, { "epoch": 0.19298245614035087, "grad_norm": 0.32193972087701545, "learning_rate": 3.2010582010582015e-05, "loss": 0.5062, "step": 121 }, { "epoch": 0.19457735247208932, "grad_norm": 0.32724679785588084, "learning_rate": 3.227513227513227e-05, "loss": 0.5412, "step": 122 }, { "epoch": 0.19617224880382775, "grad_norm": 0.27986536982396915, "learning_rate": 3.253968253968254e-05, "loss": 0.5055, "step": 123 }, { "epoch": 0.19776714513556617, "grad_norm": 0.3172850214858539, "learning_rate": 3.280423280423281e-05, "loss": 0.504, "step": 124 }, { "epoch": 0.19936204146730463, "grad_norm": 0.30107038750324344, "learning_rate": 3.306878306878307e-05, "loss": 0.5313, "step": 125 }, { "epoch": 0.20095693779904306, "grad_norm": 0.2863230714326002, "learning_rate": 3.3333333333333335e-05, "loss": 0.509, "step": 126 }, { "epoch": 0.2025518341307815, "grad_norm": 0.2660996791342997, "learning_rate": 3.35978835978836e-05, "loss": 0.4979, "step": 127 }, { "epoch": 0.20414673046251994, "grad_norm": 0.3038229268680537, "learning_rate": 3.386243386243386e-05, "loss": 0.5188, "step": 128 }, { "epoch": 0.20574162679425836, "grad_norm": 0.2736543489065968, "learning_rate": 3.412698412698413e-05, "loss": 0.5192, "step": 129 }, { "epoch": 0.20733652312599682, "grad_norm": 0.2804164805258564, "learning_rate": 3.439153439153439e-05, "loss": 0.4871, "step": 130 }, { "epoch": 0.20893141945773525, "grad_norm": 0.2948410434553801, "learning_rate": 3.465608465608466e-05, "loss": 0.4984, "step": 131 }, { "epoch": 0.21052631578947367, "grad_norm": 0.3136957109059132, "learning_rate": 3.492063492063492e-05, "loss": 0.4909, "step": 132 }, { "epoch": 0.21212121212121213, "grad_norm": 0.33726244142223305, "learning_rate": 3.518518518518519e-05, "loss": 0.4994, "step": 133 }, { "epoch": 0.21371610845295055, "grad_norm": 0.2837701080947948, "learning_rate": 3.5449735449735446e-05, "loss": 0.5073, "step": 134 }, { "epoch": 0.215311004784689, "grad_norm": 0.3062673121977755, "learning_rate": 3.571428571428572e-05, "loss": 0.5059, "step": 135 }, { "epoch": 0.21690590111642744, "grad_norm": 0.3235774642931013, "learning_rate": 3.597883597883598e-05, "loss": 0.5002, "step": 136 }, { "epoch": 0.21850079744816586, "grad_norm": 0.28873119907837386, "learning_rate": 3.6243386243386245e-05, "loss": 0.5283, "step": 137 }, { "epoch": 0.22009569377990432, "grad_norm": 0.3128688619546131, "learning_rate": 3.650793650793651e-05, "loss": 0.4765, "step": 138 }, { "epoch": 0.22169059011164274, "grad_norm": 0.31375095895716076, "learning_rate": 3.677248677248677e-05, "loss": 0.5057, "step": 139 }, { "epoch": 0.22328548644338117, "grad_norm": 0.32494275954709806, "learning_rate": 3.7037037037037037e-05, "loss": 0.5153, "step": 140 }, { "epoch": 0.22488038277511962, "grad_norm": 0.35374717902306224, "learning_rate": 3.730158730158731e-05, "loss": 0.5019, "step": 141 }, { "epoch": 0.22647527910685805, "grad_norm": 0.33547573856435725, "learning_rate": 3.7566137566137564e-05, "loss": 0.501, "step": 142 }, { "epoch": 0.22807017543859648, "grad_norm": 0.3516755535075931, "learning_rate": 3.7830687830687835e-05, "loss": 0.5283, "step": 143 }, { "epoch": 0.22966507177033493, "grad_norm": 0.3035060601895851, "learning_rate": 3.809523809523809e-05, "loss": 0.4971, "step": 144 }, { "epoch": 0.23125996810207336, "grad_norm": 0.3853861233317544, "learning_rate": 3.835978835978836e-05, "loss": 0.5217, "step": 145 }, { "epoch": 0.23285486443381181, "grad_norm": 0.30604943288677555, "learning_rate": 3.862433862433863e-05, "loss": 0.5023, "step": 146 }, { "epoch": 0.23444976076555024, "grad_norm": 0.34840076818647264, "learning_rate": 3.888888888888889e-05, "loss": 0.5024, "step": 147 }, { "epoch": 0.23604465709728867, "grad_norm": 0.2857344865019342, "learning_rate": 3.9153439153439155e-05, "loss": 0.4849, "step": 148 }, { "epoch": 0.23763955342902712, "grad_norm": 0.34204579915919253, "learning_rate": 3.941798941798942e-05, "loss": 0.4752, "step": 149 }, { "epoch": 0.23923444976076555, "grad_norm": 0.2804362658790948, "learning_rate": 3.968253968253968e-05, "loss": 0.4908, "step": 150 }, { "epoch": 0.24082934609250398, "grad_norm": 0.33293803557589996, "learning_rate": 3.9947089947089946e-05, "loss": 0.5106, "step": 151 }, { "epoch": 0.24242424242424243, "grad_norm": 0.3088460880507483, "learning_rate": 4.021164021164021e-05, "loss": 0.5051, "step": 152 }, { "epoch": 0.24401913875598086, "grad_norm": 0.31569287004925345, "learning_rate": 4.047619047619048e-05, "loss": 0.5144, "step": 153 }, { "epoch": 0.24561403508771928, "grad_norm": 0.32246526184623026, "learning_rate": 4.074074074074074e-05, "loss": 0.5095, "step": 154 }, { "epoch": 0.24720893141945774, "grad_norm": 0.3531142966032932, "learning_rate": 4.100529100529101e-05, "loss": 0.518, "step": 155 }, { "epoch": 0.24880382775119617, "grad_norm": 0.3329978447326442, "learning_rate": 4.126984126984127e-05, "loss": 0.4902, "step": 156 }, { "epoch": 0.2503987240829346, "grad_norm": 0.38597772683156834, "learning_rate": 4.153439153439154e-05, "loss": 0.4929, "step": 157 }, { "epoch": 0.25199362041467305, "grad_norm": 0.34517032573412837, "learning_rate": 4.17989417989418e-05, "loss": 0.5007, "step": 158 }, { "epoch": 0.2535885167464115, "grad_norm": 0.3559821944197488, "learning_rate": 4.2063492063492065e-05, "loss": 0.4876, "step": 159 }, { "epoch": 0.2551834130781499, "grad_norm": 0.3062431178049228, "learning_rate": 4.232804232804233e-05, "loss": 0.4902, "step": 160 }, { "epoch": 0.2567783094098884, "grad_norm": 0.36454113708612174, "learning_rate": 4.259259259259259e-05, "loss": 0.4928, "step": 161 }, { "epoch": 0.2583732057416268, "grad_norm": 0.3594805733026915, "learning_rate": 4.2857142857142856e-05, "loss": 0.5129, "step": 162 }, { "epoch": 0.25996810207336524, "grad_norm": 0.3496088393259862, "learning_rate": 4.312169312169313e-05, "loss": 0.4838, "step": 163 }, { "epoch": 0.26156299840510366, "grad_norm": 0.32650290329826104, "learning_rate": 4.3386243386243384e-05, "loss": 0.4932, "step": 164 }, { "epoch": 0.2631578947368421, "grad_norm": 0.3907102086216344, "learning_rate": 4.3650793650793655e-05, "loss": 0.5186, "step": 165 }, { "epoch": 0.2647527910685805, "grad_norm": 0.31861936078766145, "learning_rate": 4.391534391534391e-05, "loss": 0.4883, "step": 166 }, { "epoch": 0.266347687400319, "grad_norm": 0.42940791332441897, "learning_rate": 4.417989417989418e-05, "loss": 0.512, "step": 167 }, { "epoch": 0.2679425837320574, "grad_norm": 0.43051380994709115, "learning_rate": 4.4444444444444447e-05, "loss": 0.4907, "step": 168 }, { "epoch": 0.26953748006379585, "grad_norm": 0.4297097093997621, "learning_rate": 4.470899470899471e-05, "loss": 0.4872, "step": 169 }, { "epoch": 0.2711323763955343, "grad_norm": 0.3784917044688456, "learning_rate": 4.4973544973544974e-05, "loss": 0.5003, "step": 170 }, { "epoch": 0.2727272727272727, "grad_norm": 0.3287283106553596, "learning_rate": 4.523809523809524e-05, "loss": 0.5088, "step": 171 }, { "epoch": 0.2743221690590112, "grad_norm": 0.3846418099877241, "learning_rate": 4.55026455026455e-05, "loss": 0.4801, "step": 172 }, { "epoch": 0.2759170653907496, "grad_norm": 0.34310577517844726, "learning_rate": 4.576719576719577e-05, "loss": 0.5071, "step": 173 }, { "epoch": 0.27751196172248804, "grad_norm": 0.39427331425444484, "learning_rate": 4.603174603174603e-05, "loss": 0.4917, "step": 174 }, { "epoch": 0.27910685805422647, "grad_norm": 0.4224179643135878, "learning_rate": 4.62962962962963e-05, "loss": 0.471, "step": 175 }, { "epoch": 0.2807017543859649, "grad_norm": 0.3100375396304059, "learning_rate": 4.656084656084656e-05, "loss": 0.4995, "step": 176 }, { "epoch": 0.2822966507177033, "grad_norm": 0.474242870036674, "learning_rate": 4.682539682539683e-05, "loss": 0.5191, "step": 177 }, { "epoch": 0.2838915470494418, "grad_norm": 0.3864410742870871, "learning_rate": 4.708994708994709e-05, "loss": 0.5077, "step": 178 }, { "epoch": 0.28548644338118023, "grad_norm": 0.5084776075357123, "learning_rate": 4.7354497354497356e-05, "loss": 0.4735, "step": 179 }, { "epoch": 0.28708133971291866, "grad_norm": 0.32866754189751185, "learning_rate": 4.761904761904762e-05, "loss": 0.4828, "step": 180 }, { "epoch": 0.2886762360446571, "grad_norm": 0.5125932146602514, "learning_rate": 4.7883597883597884e-05, "loss": 0.492, "step": 181 }, { "epoch": 0.2902711323763955, "grad_norm": 0.357694635141064, "learning_rate": 4.814814814814815e-05, "loss": 0.4855, "step": 182 }, { "epoch": 0.291866028708134, "grad_norm": 0.5075208572353158, "learning_rate": 4.841269841269841e-05, "loss": 0.507, "step": 183 }, { "epoch": 0.2934609250398724, "grad_norm": 0.40333535858357894, "learning_rate": 4.8677248677248676e-05, "loss": 0.4861, "step": 184 }, { "epoch": 0.29505582137161085, "grad_norm": 0.37136169692696386, "learning_rate": 4.894179894179895e-05, "loss": 0.4737, "step": 185 }, { "epoch": 0.2966507177033493, "grad_norm": 0.40355343432971863, "learning_rate": 4.9206349206349204e-05, "loss": 0.4825, "step": 186 }, { "epoch": 0.2982456140350877, "grad_norm": 0.3490222556327294, "learning_rate": 4.9470899470899475e-05, "loss": 0.4779, "step": 187 }, { "epoch": 0.29984051036682613, "grad_norm": 0.40293595464635956, "learning_rate": 4.973544973544973e-05, "loss": 0.5065, "step": 188 }, { "epoch": 0.3014354066985646, "grad_norm": 0.4448933421982631, "learning_rate": 5e-05, "loss": 0.4803, "step": 189 }, { "epoch": 0.30303030303030304, "grad_norm": 0.42952345136153947, "learning_rate": 4.9970449172576836e-05, "loss": 0.4914, "step": 190 }, { "epoch": 0.30462519936204147, "grad_norm": 0.3474082899519934, "learning_rate": 4.994089834515367e-05, "loss": 0.4872, "step": 191 }, { "epoch": 0.3062200956937799, "grad_norm": 0.36055515453296016, "learning_rate": 4.99113475177305e-05, "loss": 0.4978, "step": 192 }, { "epoch": 0.3078149920255183, "grad_norm": 0.32082146287661295, "learning_rate": 4.988179669030733e-05, "loss": 0.4829, "step": 193 }, { "epoch": 0.3094098883572568, "grad_norm": 0.36499928051563435, "learning_rate": 4.985224586288416e-05, "loss": 0.4808, "step": 194 }, { "epoch": 0.31100478468899523, "grad_norm": 0.3279790584256245, "learning_rate": 4.9822695035461e-05, "loss": 0.4871, "step": 195 }, { "epoch": 0.31259968102073366, "grad_norm": 0.3073428743219709, "learning_rate": 4.979314420803783e-05, "loss": 0.4937, "step": 196 }, { "epoch": 0.3141945773524721, "grad_norm": 0.3978777934720302, "learning_rate": 4.976359338061466e-05, "loss": 0.4653, "step": 197 }, { "epoch": 0.3157894736842105, "grad_norm": 0.38212880396120324, "learning_rate": 4.973404255319149e-05, "loss": 0.4705, "step": 198 }, { "epoch": 0.31738437001594894, "grad_norm": 0.33153411737491273, "learning_rate": 4.970449172576833e-05, "loss": 0.4736, "step": 199 }, { "epoch": 0.3189792663476874, "grad_norm": 0.36046780524241595, "learning_rate": 4.967494089834516e-05, "loss": 0.4733, "step": 200 }, { "epoch": 0.32057416267942584, "grad_norm": 0.2987618176182356, "learning_rate": 4.964539007092199e-05, "loss": 0.4815, "step": 201 }, { "epoch": 0.32216905901116427, "grad_norm": 0.37902527918601653, "learning_rate": 4.961583924349882e-05, "loss": 0.4705, "step": 202 }, { "epoch": 0.3237639553429027, "grad_norm": 0.27853323357241183, "learning_rate": 4.958628841607565e-05, "loss": 0.4775, "step": 203 }, { "epoch": 0.3253588516746411, "grad_norm": 0.5007159466037976, "learning_rate": 4.9556737588652486e-05, "loss": 0.5148, "step": 204 }, { "epoch": 0.3269537480063796, "grad_norm": 0.3065952486317295, "learning_rate": 4.9527186761229313e-05, "loss": 0.4722, "step": 205 }, { "epoch": 0.32854864433811803, "grad_norm": 0.3839551224750691, "learning_rate": 4.949763593380615e-05, "loss": 0.4771, "step": 206 }, { "epoch": 0.33014354066985646, "grad_norm": 0.3906437422093606, "learning_rate": 4.946808510638298e-05, "loss": 0.4903, "step": 207 }, { "epoch": 0.3317384370015949, "grad_norm": 0.3043787837529888, "learning_rate": 4.9438534278959816e-05, "loss": 0.4677, "step": 208 }, { "epoch": 0.3333333333333333, "grad_norm": 0.43719961082811304, "learning_rate": 4.940898345153664e-05, "loss": 0.4861, "step": 209 }, { "epoch": 0.3349282296650718, "grad_norm": 0.39652056373638994, "learning_rate": 4.937943262411348e-05, "loss": 0.4934, "step": 210 }, { "epoch": 0.3365231259968102, "grad_norm": 0.39924305266290544, "learning_rate": 4.934988179669031e-05, "loss": 0.476, "step": 211 }, { "epoch": 0.33811802232854865, "grad_norm": 0.3400254174301479, "learning_rate": 4.9320330969267145e-05, "loss": 0.4804, "step": 212 }, { "epoch": 0.3397129186602871, "grad_norm": 0.39555150101230785, "learning_rate": 4.929078014184397e-05, "loss": 0.5014, "step": 213 }, { "epoch": 0.3413078149920255, "grad_norm": 0.43380897093723153, "learning_rate": 4.9261229314420806e-05, "loss": 0.4769, "step": 214 }, { "epoch": 0.34290271132376393, "grad_norm": 0.32234942147500883, "learning_rate": 4.923167848699764e-05, "loss": 0.4868, "step": 215 }, { "epoch": 0.3444976076555024, "grad_norm": 0.4946046049535497, "learning_rate": 4.920212765957447e-05, "loss": 0.4871, "step": 216 }, { "epoch": 0.34609250398724084, "grad_norm": 0.424744015343933, "learning_rate": 4.91725768321513e-05, "loss": 0.4929, "step": 217 }, { "epoch": 0.34768740031897927, "grad_norm": 0.3935597107130945, "learning_rate": 4.914302600472813e-05, "loss": 0.4825, "step": 218 }, { "epoch": 0.3492822966507177, "grad_norm": 0.44922102055062135, "learning_rate": 4.911347517730497e-05, "loss": 0.496, "step": 219 }, { "epoch": 0.3508771929824561, "grad_norm": 0.5215874658597306, "learning_rate": 4.90839243498818e-05, "loss": 0.483, "step": 220 }, { "epoch": 0.3524720893141946, "grad_norm": 0.44546894937760306, "learning_rate": 4.905437352245863e-05, "loss": 0.4531, "step": 221 }, { "epoch": 0.35406698564593303, "grad_norm": 0.5086427339921723, "learning_rate": 4.9024822695035465e-05, "loss": 0.4623, "step": 222 }, { "epoch": 0.35566188197767146, "grad_norm": 0.5552026242135457, "learning_rate": 4.89952718676123e-05, "loss": 0.4766, "step": 223 }, { "epoch": 0.3572567783094099, "grad_norm": 0.3950192706646122, "learning_rate": 4.896572104018913e-05, "loss": 0.4738, "step": 224 }, { "epoch": 0.3588516746411483, "grad_norm": 0.4742572691288469, "learning_rate": 4.893617021276596e-05, "loss": 0.4813, "step": 225 }, { "epoch": 0.36044657097288674, "grad_norm": 0.39502450872567985, "learning_rate": 4.8906619385342795e-05, "loss": 0.454, "step": 226 }, { "epoch": 0.3620414673046252, "grad_norm": 0.4074486159222281, "learning_rate": 4.887706855791962e-05, "loss": 0.4828, "step": 227 }, { "epoch": 0.36363636363636365, "grad_norm": 0.42129891354246873, "learning_rate": 4.8847517730496456e-05, "loss": 0.4726, "step": 228 }, { "epoch": 0.3652312599681021, "grad_norm": 0.4830821115531617, "learning_rate": 4.8817966903073283e-05, "loss": 0.4589, "step": 229 }, { "epoch": 0.3668261562998405, "grad_norm": 0.3527785418825981, "learning_rate": 4.8788416075650124e-05, "loss": 0.4964, "step": 230 }, { "epoch": 0.3684210526315789, "grad_norm": 0.5364224486953513, "learning_rate": 4.875886524822695e-05, "loss": 0.4692, "step": 231 }, { "epoch": 0.3700159489633174, "grad_norm": 0.3621310584557245, "learning_rate": 4.8729314420803786e-05, "loss": 0.4763, "step": 232 }, { "epoch": 0.37161084529505584, "grad_norm": 0.4835960678760354, "learning_rate": 4.869976359338061e-05, "loss": 0.4851, "step": 233 }, { "epoch": 0.37320574162679426, "grad_norm": 0.38603880575858307, "learning_rate": 4.8670212765957454e-05, "loss": 0.4566, "step": 234 }, { "epoch": 0.3748006379585327, "grad_norm": 0.38609445027960143, "learning_rate": 4.864066193853428e-05, "loss": 0.4717, "step": 235 }, { "epoch": 0.3763955342902711, "grad_norm": 0.32712024839159276, "learning_rate": 4.8611111111111115e-05, "loss": 0.4723, "step": 236 }, { "epoch": 0.37799043062200954, "grad_norm": 0.48803372605758294, "learning_rate": 4.858156028368794e-05, "loss": 0.507, "step": 237 }, { "epoch": 0.379585326953748, "grad_norm": 0.3235883013156243, "learning_rate": 4.8552009456264776e-05, "loss": 0.4857, "step": 238 }, { "epoch": 0.38118022328548645, "grad_norm": 0.4743879772164701, "learning_rate": 4.852245862884161e-05, "loss": 0.4626, "step": 239 }, { "epoch": 0.3827751196172249, "grad_norm": 0.26817465682157243, "learning_rate": 4.8492907801418445e-05, "loss": 0.4628, "step": 240 }, { "epoch": 0.3843700159489633, "grad_norm": 0.39802776297309767, "learning_rate": 4.846335697399527e-05, "loss": 0.4905, "step": 241 }, { "epoch": 0.38596491228070173, "grad_norm": 0.2713574283884333, "learning_rate": 4.8433806146572106e-05, "loss": 0.4475, "step": 242 }, { "epoch": 0.3875598086124402, "grad_norm": 0.28938974099838244, "learning_rate": 4.840425531914894e-05, "loss": 0.4555, "step": 243 }, { "epoch": 0.38915470494417864, "grad_norm": 0.28554196352645056, "learning_rate": 4.837470449172577e-05, "loss": 0.4654, "step": 244 }, { "epoch": 0.39074960127591707, "grad_norm": 0.35360436916389715, "learning_rate": 4.83451536643026e-05, "loss": 0.4696, "step": 245 }, { "epoch": 0.3923444976076555, "grad_norm": 0.2570414359203994, "learning_rate": 4.8315602836879435e-05, "loss": 0.4787, "step": 246 }, { "epoch": 0.3939393939393939, "grad_norm": 0.3321599083195138, "learning_rate": 4.828605200945627e-05, "loss": 0.4755, "step": 247 }, { "epoch": 0.39553429027113235, "grad_norm": 0.2513608398204412, "learning_rate": 4.82565011820331e-05, "loss": 0.4564, "step": 248 }, { "epoch": 0.39712918660287083, "grad_norm": 0.3062279216303403, "learning_rate": 4.822695035460993e-05, "loss": 0.45, "step": 249 }, { "epoch": 0.39872408293460926, "grad_norm": 0.2754098920923985, "learning_rate": 4.8197399527186765e-05, "loss": 0.4819, "step": 250 }, { "epoch": 0.4003189792663477, "grad_norm": 0.2882271157934727, "learning_rate": 4.81678486997636e-05, "loss": 0.4596, "step": 251 }, { "epoch": 0.4019138755980861, "grad_norm": 0.3134684413178123, "learning_rate": 4.8138297872340426e-05, "loss": 0.4644, "step": 252 }, { "epoch": 0.40350877192982454, "grad_norm": 0.2713972895593276, "learning_rate": 4.810874704491726e-05, "loss": 0.481, "step": 253 }, { "epoch": 0.405103668261563, "grad_norm": 0.36764170542656727, "learning_rate": 4.8079196217494094e-05, "loss": 0.4717, "step": 254 }, { "epoch": 0.40669856459330145, "grad_norm": 0.2733815320920827, "learning_rate": 4.804964539007092e-05, "loss": 0.4883, "step": 255 }, { "epoch": 0.4082934609250399, "grad_norm": 0.33591549215184496, "learning_rate": 4.8020094562647756e-05, "loss": 0.4786, "step": 256 }, { "epoch": 0.4098883572567783, "grad_norm": 0.27553760655522397, "learning_rate": 4.799054373522459e-05, "loss": 0.4543, "step": 257 }, { "epoch": 0.41148325358851673, "grad_norm": 0.3019197176335439, "learning_rate": 4.7960992907801424e-05, "loss": 0.4636, "step": 258 }, { "epoch": 0.4130781499202552, "grad_norm": 0.27388843929389495, "learning_rate": 4.793144208037825e-05, "loss": 0.4908, "step": 259 }, { "epoch": 0.41467304625199364, "grad_norm": 0.2722097257612581, "learning_rate": 4.7901891252955085e-05, "loss": 0.4527, "step": 260 }, { "epoch": 0.41626794258373206, "grad_norm": 0.2921519175150001, "learning_rate": 4.787234042553192e-05, "loss": 0.4746, "step": 261 }, { "epoch": 0.4178628389154705, "grad_norm": 0.27351789759891154, "learning_rate": 4.784278959810875e-05, "loss": 0.4671, "step": 262 }, { "epoch": 0.4194577352472089, "grad_norm": 0.3071365614244999, "learning_rate": 4.781323877068558e-05, "loss": 0.4783, "step": 263 }, { "epoch": 0.42105263157894735, "grad_norm": 0.2795337446954575, "learning_rate": 4.7783687943262415e-05, "loss": 0.4851, "step": 264 }, { "epoch": 0.4226475279106858, "grad_norm": 0.3113909123611502, "learning_rate": 4.775413711583925e-05, "loss": 0.4659, "step": 265 }, { "epoch": 0.42424242424242425, "grad_norm": 0.31624720132242584, "learning_rate": 4.7724586288416076e-05, "loss": 0.4879, "step": 266 }, { "epoch": 0.4258373205741627, "grad_norm": 0.35727315921133795, "learning_rate": 4.769503546099291e-05, "loss": 0.4817, "step": 267 }, { "epoch": 0.4274322169059011, "grad_norm": 0.2692119621276329, "learning_rate": 4.766548463356974e-05, "loss": 0.4616, "step": 268 }, { "epoch": 0.42902711323763953, "grad_norm": 0.36714171937887286, "learning_rate": 4.763593380614658e-05, "loss": 0.4821, "step": 269 }, { "epoch": 0.430622009569378, "grad_norm": 0.2691916243829859, "learning_rate": 4.7606382978723405e-05, "loss": 0.4599, "step": 270 }, { "epoch": 0.43221690590111644, "grad_norm": 0.3653351373637106, "learning_rate": 4.757683215130024e-05, "loss": 0.4721, "step": 271 }, { "epoch": 0.43381180223285487, "grad_norm": 0.2734698239450601, "learning_rate": 4.754728132387707e-05, "loss": 0.4789, "step": 272 }, { "epoch": 0.4354066985645933, "grad_norm": 0.2835679915716268, "learning_rate": 4.751773049645391e-05, "loss": 0.4404, "step": 273 }, { "epoch": 0.4370015948963317, "grad_norm": 0.2900741967997515, "learning_rate": 4.7488179669030735e-05, "loss": 0.4624, "step": 274 }, { "epoch": 0.43859649122807015, "grad_norm": 0.3105007263619072, "learning_rate": 4.745862884160757e-05, "loss": 0.444, "step": 275 }, { "epoch": 0.44019138755980863, "grad_norm": 0.2836126632920893, "learning_rate": 4.7429078014184396e-05, "loss": 0.4722, "step": 276 }, { "epoch": 0.44178628389154706, "grad_norm": 0.3034978963693795, "learning_rate": 4.739952718676123e-05, "loss": 0.4786, "step": 277 }, { "epoch": 0.4433811802232855, "grad_norm": 0.29794294171608177, "learning_rate": 4.7369976359338064e-05, "loss": 0.4695, "step": 278 }, { "epoch": 0.4449760765550239, "grad_norm": 0.27403750193717574, "learning_rate": 4.734042553191489e-05, "loss": 0.4574, "step": 279 }, { "epoch": 0.44657097288676234, "grad_norm": 0.3117815922235126, "learning_rate": 4.7310874704491726e-05, "loss": 0.4849, "step": 280 }, { "epoch": 0.4481658692185008, "grad_norm": 0.3306610156372188, "learning_rate": 4.728132387706856e-05, "loss": 0.4777, "step": 281 }, { "epoch": 0.44976076555023925, "grad_norm": 0.3077544251897433, "learning_rate": 4.7251773049645394e-05, "loss": 0.4622, "step": 282 }, { "epoch": 0.4513556618819777, "grad_norm": 0.2768826890067544, "learning_rate": 4.722222222222222e-05, "loss": 0.4694, "step": 283 }, { "epoch": 0.4529505582137161, "grad_norm": 0.27421463376160266, "learning_rate": 4.7192671394799055e-05, "loss": 0.471, "step": 284 }, { "epoch": 0.45454545454545453, "grad_norm": 0.3318919895138752, "learning_rate": 4.716312056737589e-05, "loss": 0.4821, "step": 285 }, { "epoch": 0.45614035087719296, "grad_norm": 0.2855925254298236, "learning_rate": 4.713356973995272e-05, "loss": 0.4583, "step": 286 }, { "epoch": 0.45773524720893144, "grad_norm": 0.35604960608845276, "learning_rate": 4.710401891252955e-05, "loss": 0.4907, "step": 287 }, { "epoch": 0.45933014354066987, "grad_norm": 0.2830482850628954, "learning_rate": 4.7074468085106385e-05, "loss": 0.4634, "step": 288 }, { "epoch": 0.4609250398724083, "grad_norm": 0.32155923758745747, "learning_rate": 4.704491725768322e-05, "loss": 0.4788, "step": 289 }, { "epoch": 0.4625199362041467, "grad_norm": 0.30560536250650194, "learning_rate": 4.701536643026005e-05, "loss": 0.4784, "step": 290 }, { "epoch": 0.46411483253588515, "grad_norm": 0.3086507860577177, "learning_rate": 4.698581560283688e-05, "loss": 0.4608, "step": 291 }, { "epoch": 0.46570972886762363, "grad_norm": 0.3216991774569081, "learning_rate": 4.6956264775413714e-05, "loss": 0.4495, "step": 292 }, { "epoch": 0.46730462519936206, "grad_norm": 0.32355805820787265, "learning_rate": 4.692671394799055e-05, "loss": 0.4759, "step": 293 }, { "epoch": 0.4688995215311005, "grad_norm": 0.3145132987153644, "learning_rate": 4.6897163120567375e-05, "loss": 0.4621, "step": 294 }, { "epoch": 0.4704944178628389, "grad_norm": 0.2755154155976271, "learning_rate": 4.686761229314421e-05, "loss": 0.4663, "step": 295 }, { "epoch": 0.47208931419457734, "grad_norm": 0.30563715354534393, "learning_rate": 4.6838061465721044e-05, "loss": 0.4575, "step": 296 }, { "epoch": 0.47368421052631576, "grad_norm": 0.3144318544599538, "learning_rate": 4.680851063829788e-05, "loss": 0.4523, "step": 297 }, { "epoch": 0.47527910685805425, "grad_norm": 0.281436904779696, "learning_rate": 4.6778959810874705e-05, "loss": 0.457, "step": 298 }, { "epoch": 0.4768740031897927, "grad_norm": 0.2946600874688791, "learning_rate": 4.674940898345154e-05, "loss": 0.4654, "step": 299 }, { "epoch": 0.4784688995215311, "grad_norm": 0.2817273315501034, "learning_rate": 4.671985815602837e-05, "loss": 0.4541, "step": 300 }, { "epoch": 0.4800637958532695, "grad_norm": 0.31377040794397604, "learning_rate": 4.669030732860521e-05, "loss": 0.4685, "step": 301 }, { "epoch": 0.48165869218500795, "grad_norm": 0.3001850673848778, "learning_rate": 4.6660756501182034e-05, "loss": 0.45, "step": 302 }, { "epoch": 0.48325358851674644, "grad_norm": 0.24283496038908764, "learning_rate": 4.663120567375887e-05, "loss": 0.4531, "step": 303 }, { "epoch": 0.48484848484848486, "grad_norm": 0.33216152829153434, "learning_rate": 4.66016548463357e-05, "loss": 0.4649, "step": 304 }, { "epoch": 0.4864433811802233, "grad_norm": 0.25272307331304017, "learning_rate": 4.657210401891253e-05, "loss": 0.4672, "step": 305 }, { "epoch": 0.4880382775119617, "grad_norm": 0.282382611925632, "learning_rate": 4.6542553191489364e-05, "loss": 0.4421, "step": 306 }, { "epoch": 0.48963317384370014, "grad_norm": 0.2901435271856519, "learning_rate": 4.651300236406619e-05, "loss": 0.4684, "step": 307 }, { "epoch": 0.49122807017543857, "grad_norm": 0.275330972483419, "learning_rate": 4.648345153664303e-05, "loss": 0.4772, "step": 308 }, { "epoch": 0.49282296650717705, "grad_norm": 0.2863176285759138, "learning_rate": 4.645390070921986e-05, "loss": 0.4662, "step": 309 }, { "epoch": 0.4944178628389155, "grad_norm": 0.25957737485049864, "learning_rate": 4.642434988179669e-05, "loss": 0.4603, "step": 310 }, { "epoch": 0.4960127591706539, "grad_norm": 0.2788236121056464, "learning_rate": 4.639479905437352e-05, "loss": 0.4639, "step": 311 }, { "epoch": 0.49760765550239233, "grad_norm": 0.26843556527288503, "learning_rate": 4.636524822695036e-05, "loss": 0.466, "step": 312 }, { "epoch": 0.49920255183413076, "grad_norm": 0.29210712036252223, "learning_rate": 4.633569739952719e-05, "loss": 0.4617, "step": 313 }, { "epoch": 0.5007974481658692, "grad_norm": 0.2785255834737654, "learning_rate": 4.630614657210402e-05, "loss": 0.4731, "step": 314 }, { "epoch": 0.5023923444976076, "grad_norm": 0.28757488345547566, "learning_rate": 4.627659574468085e-05, "loss": 0.4504, "step": 315 }, { "epoch": 0.5039872408293461, "grad_norm": 0.2932406743138567, "learning_rate": 4.6247044917257684e-05, "loss": 0.4573, "step": 316 }, { "epoch": 0.5055821371610846, "grad_norm": 0.2978203726362237, "learning_rate": 4.621749408983452e-05, "loss": 0.4613, "step": 317 }, { "epoch": 0.507177033492823, "grad_norm": 0.26872272801882247, "learning_rate": 4.6187943262411345e-05, "loss": 0.4525, "step": 318 }, { "epoch": 0.5087719298245614, "grad_norm": 0.29879886965357155, "learning_rate": 4.615839243498818e-05, "loss": 0.4551, "step": 319 }, { "epoch": 0.5103668261562998, "grad_norm": 0.2484041772066918, "learning_rate": 4.6128841607565014e-05, "loss": 0.4474, "step": 320 }, { "epoch": 0.5119617224880383, "grad_norm": 0.3270193875819324, "learning_rate": 4.609929078014185e-05, "loss": 0.452, "step": 321 }, { "epoch": 0.5135566188197768, "grad_norm": 0.2617615152914598, "learning_rate": 4.6069739952718675e-05, "loss": 0.4453, "step": 322 }, { "epoch": 0.5151515151515151, "grad_norm": 0.2736301891274981, "learning_rate": 4.6040189125295516e-05, "loss": 0.45, "step": 323 }, { "epoch": 0.5167464114832536, "grad_norm": 0.2905010919843766, "learning_rate": 4.601063829787234e-05, "loss": 0.4647, "step": 324 }, { "epoch": 0.518341307814992, "grad_norm": 0.274413521705795, "learning_rate": 4.598108747044918e-05, "loss": 0.4729, "step": 325 }, { "epoch": 0.5199362041467305, "grad_norm": 0.2760717526917849, "learning_rate": 4.5951536643026004e-05, "loss": 0.4339, "step": 326 }, { "epoch": 0.5215311004784688, "grad_norm": 0.2782488671990798, "learning_rate": 4.592198581560284e-05, "loss": 0.4657, "step": 327 }, { "epoch": 0.5231259968102073, "grad_norm": 0.3099002542201853, "learning_rate": 4.589243498817967e-05, "loss": 0.4597, "step": 328 }, { "epoch": 0.5247208931419458, "grad_norm": 0.2857298164168067, "learning_rate": 4.58628841607565e-05, "loss": 0.4631, "step": 329 }, { "epoch": 0.5263157894736842, "grad_norm": 0.30095504564733305, "learning_rate": 4.5833333333333334e-05, "loss": 0.457, "step": 330 }, { "epoch": 0.5279106858054227, "grad_norm": 0.25687262631192304, "learning_rate": 4.580378250591017e-05, "loss": 0.457, "step": 331 }, { "epoch": 0.529505582137161, "grad_norm": 0.24389240580723723, "learning_rate": 4.5774231678487e-05, "loss": 0.4334, "step": 332 }, { "epoch": 0.5311004784688995, "grad_norm": 0.2874951855080189, "learning_rate": 4.574468085106383e-05, "loss": 0.4557, "step": 333 }, { "epoch": 0.532695374800638, "grad_norm": 0.2512842839762162, "learning_rate": 4.571513002364066e-05, "loss": 0.4527, "step": 334 }, { "epoch": 0.5342902711323764, "grad_norm": 0.2949199790081541, "learning_rate": 4.56855791962175e-05, "loss": 0.4727, "step": 335 }, { "epoch": 0.5358851674641149, "grad_norm": 0.2843782006978418, "learning_rate": 4.565602836879433e-05, "loss": 0.457, "step": 336 }, { "epoch": 0.5374800637958532, "grad_norm": 0.2714925089988213, "learning_rate": 4.562647754137116e-05, "loss": 0.4557, "step": 337 }, { "epoch": 0.5390749601275917, "grad_norm": 0.25901739219956965, "learning_rate": 4.559692671394799e-05, "loss": 0.4524, "step": 338 }, { "epoch": 0.5406698564593302, "grad_norm": 0.2657087038698919, "learning_rate": 4.556737588652483e-05, "loss": 0.4578, "step": 339 }, { "epoch": 0.5422647527910686, "grad_norm": 0.2529354722690404, "learning_rate": 4.553782505910166e-05, "loss": 0.4426, "step": 340 }, { "epoch": 0.543859649122807, "grad_norm": 0.2504981964224946, "learning_rate": 4.550827423167849e-05, "loss": 0.4474, "step": 341 }, { "epoch": 0.5454545454545454, "grad_norm": 0.2592588671790806, "learning_rate": 4.547872340425532e-05, "loss": 0.4404, "step": 342 }, { "epoch": 0.5470494417862839, "grad_norm": 0.26276602410399647, "learning_rate": 4.5449172576832156e-05, "loss": 0.4375, "step": 343 }, { "epoch": 0.5486443381180224, "grad_norm": 0.26437661208789554, "learning_rate": 4.5419621749408984e-05, "loss": 0.4559, "step": 344 }, { "epoch": 0.5502392344497608, "grad_norm": 0.28889591924393754, "learning_rate": 4.539007092198582e-05, "loss": 0.4609, "step": 345 }, { "epoch": 0.5518341307814992, "grad_norm": 0.2963529264363304, "learning_rate": 4.5360520094562645e-05, "loss": 0.4668, "step": 346 }, { "epoch": 0.5534290271132376, "grad_norm": 0.3221235189169606, "learning_rate": 4.5330969267139486e-05, "loss": 0.4595, "step": 347 }, { "epoch": 0.5550239234449761, "grad_norm": 0.2584399076350894, "learning_rate": 4.530141843971631e-05, "loss": 0.4536, "step": 348 }, { "epoch": 0.5566188197767146, "grad_norm": 0.3259765155101947, "learning_rate": 4.527186761229315e-05, "loss": 0.4615, "step": 349 }, { "epoch": 0.5582137161084529, "grad_norm": 0.2962944728063518, "learning_rate": 4.5242316784869974e-05, "loss": 0.4551, "step": 350 }, { "epoch": 0.5598086124401914, "grad_norm": 0.2685151656200771, "learning_rate": 4.5212765957446815e-05, "loss": 0.4545, "step": 351 }, { "epoch": 0.5614035087719298, "grad_norm": 0.3406827661248063, "learning_rate": 4.518321513002364e-05, "loss": 0.4641, "step": 352 }, { "epoch": 0.5629984051036683, "grad_norm": 0.2713809289536123, "learning_rate": 4.5153664302600477e-05, "loss": 0.4743, "step": 353 }, { "epoch": 0.5645933014354066, "grad_norm": 0.27739354009299066, "learning_rate": 4.512411347517731e-05, "loss": 0.4593, "step": 354 }, { "epoch": 0.5661881977671451, "grad_norm": 0.3087256322849153, "learning_rate": 4.509456264775414e-05, "loss": 0.4579, "step": 355 }, { "epoch": 0.5677830940988836, "grad_norm": 0.26272653276386104, "learning_rate": 4.506501182033097e-05, "loss": 0.4503, "step": 356 }, { "epoch": 0.569377990430622, "grad_norm": 0.3093310271579678, "learning_rate": 4.50354609929078e-05, "loss": 0.4761, "step": 357 }, { "epoch": 0.5709728867623605, "grad_norm": 0.25317012896957414, "learning_rate": 4.500591016548464e-05, "loss": 0.4348, "step": 358 }, { "epoch": 0.5725677830940988, "grad_norm": 0.31714604700193205, "learning_rate": 4.497635933806147e-05, "loss": 0.4383, "step": 359 }, { "epoch": 0.5741626794258373, "grad_norm": 0.2475236257287617, "learning_rate": 4.49468085106383e-05, "loss": 0.4394, "step": 360 }, { "epoch": 0.5757575757575758, "grad_norm": 0.3037524472998184, "learning_rate": 4.491725768321513e-05, "loss": 0.4511, "step": 361 }, { "epoch": 0.5773524720893142, "grad_norm": 0.30342727078816883, "learning_rate": 4.488770685579197e-05, "loss": 0.4662, "step": 362 }, { "epoch": 0.5789473684210527, "grad_norm": 0.298449885355356, "learning_rate": 4.48581560283688e-05, "loss": 0.462, "step": 363 }, { "epoch": 0.580542264752791, "grad_norm": 0.3123321782978768, "learning_rate": 4.482860520094563e-05, "loss": 0.4701, "step": 364 }, { "epoch": 0.5821371610845295, "grad_norm": 0.2934972470107859, "learning_rate": 4.479905437352246e-05, "loss": 0.4759, "step": 365 }, { "epoch": 0.583732057416268, "grad_norm": 0.2757659545699461, "learning_rate": 4.476950354609929e-05, "loss": 0.4611, "step": 366 }, { "epoch": 0.5853269537480064, "grad_norm": 0.2772926382790529, "learning_rate": 4.4739952718676126e-05, "loss": 0.4526, "step": 367 }, { "epoch": 0.5869218500797448, "grad_norm": 0.2875720749731135, "learning_rate": 4.4710401891252954e-05, "loss": 0.4382, "step": 368 }, { "epoch": 0.5885167464114832, "grad_norm": 0.2521710882136664, "learning_rate": 4.468085106382979e-05, "loss": 0.4413, "step": 369 }, { "epoch": 0.5901116427432217, "grad_norm": 0.3169315592059172, "learning_rate": 4.465130023640662e-05, "loss": 0.4576, "step": 370 }, { "epoch": 0.5917065390749602, "grad_norm": 0.2611355816858306, "learning_rate": 4.4621749408983456e-05, "loss": 0.4667, "step": 371 }, { "epoch": 0.5933014354066986, "grad_norm": 0.2897218093512622, "learning_rate": 4.459219858156028e-05, "loss": 0.4483, "step": 372 }, { "epoch": 0.594896331738437, "grad_norm": 0.2638980841729459, "learning_rate": 4.456264775413712e-05, "loss": 0.4508, "step": 373 }, { "epoch": 0.5964912280701754, "grad_norm": 0.274319103716694, "learning_rate": 4.453309692671395e-05, "loss": 0.4574, "step": 374 }, { "epoch": 0.5980861244019139, "grad_norm": 0.27713261776616444, "learning_rate": 4.4503546099290785e-05, "loss": 0.4649, "step": 375 }, { "epoch": 0.5996810207336523, "grad_norm": 0.28217239066399435, "learning_rate": 4.447399527186761e-05, "loss": 0.455, "step": 376 }, { "epoch": 0.6012759170653907, "grad_norm": 0.26262484663584224, "learning_rate": 4.4444444444444447e-05, "loss": 0.455, "step": 377 }, { "epoch": 0.6028708133971292, "grad_norm": 0.31735346501666967, "learning_rate": 4.441489361702128e-05, "loss": 0.4644, "step": 378 }, { "epoch": 0.6044657097288676, "grad_norm": 0.28093850070714993, "learning_rate": 4.438534278959811e-05, "loss": 0.4527, "step": 379 }, { "epoch": 0.6060606060606061, "grad_norm": 0.31901144170568785, "learning_rate": 4.435579196217494e-05, "loss": 0.4586, "step": 380 }, { "epoch": 0.6076555023923444, "grad_norm": 0.29089440710491776, "learning_rate": 4.432624113475177e-05, "loss": 0.4461, "step": 381 }, { "epoch": 0.6092503987240829, "grad_norm": 0.2503898046211875, "learning_rate": 4.429669030732861e-05, "loss": 0.4419, "step": 382 }, { "epoch": 0.6108452950558214, "grad_norm": 0.32293195847638423, "learning_rate": 4.426713947990544e-05, "loss": 0.4687, "step": 383 }, { "epoch": 0.6124401913875598, "grad_norm": 0.27362514133759086, "learning_rate": 4.423758865248227e-05, "loss": 0.449, "step": 384 }, { "epoch": 0.6140350877192983, "grad_norm": 0.2787943977024225, "learning_rate": 4.4208037825059106e-05, "loss": 0.4425, "step": 385 }, { "epoch": 0.6156299840510366, "grad_norm": 0.275848808295457, "learning_rate": 4.417848699763594e-05, "loss": 0.4453, "step": 386 }, { "epoch": 0.6172248803827751, "grad_norm": 0.27084161451865435, "learning_rate": 4.414893617021277e-05, "loss": 0.4517, "step": 387 }, { "epoch": 0.6188197767145136, "grad_norm": 0.2789843245095661, "learning_rate": 4.41193853427896e-05, "loss": 0.4463, "step": 388 }, { "epoch": 0.620414673046252, "grad_norm": 0.26329628442651404, "learning_rate": 4.4089834515366435e-05, "loss": 0.4474, "step": 389 }, { "epoch": 0.6220095693779905, "grad_norm": 0.3017703147681768, "learning_rate": 4.406028368794327e-05, "loss": 0.4586, "step": 390 }, { "epoch": 0.6236044657097288, "grad_norm": 0.28329654946232574, "learning_rate": 4.4030732860520096e-05, "loss": 0.4415, "step": 391 }, { "epoch": 0.6251993620414673, "grad_norm": 0.29905910134078906, "learning_rate": 4.400118203309693e-05, "loss": 0.4515, "step": 392 }, { "epoch": 0.6267942583732058, "grad_norm": 0.2499414006467597, "learning_rate": 4.3971631205673764e-05, "loss": 0.4334, "step": 393 }, { "epoch": 0.6283891547049442, "grad_norm": 0.30096834334260153, "learning_rate": 4.394208037825059e-05, "loss": 0.4518, "step": 394 }, { "epoch": 0.6299840510366826, "grad_norm": 0.23343903316193784, "learning_rate": 4.3912529550827426e-05, "loss": 0.4431, "step": 395 }, { "epoch": 0.631578947368421, "grad_norm": 0.31137184707271565, "learning_rate": 4.388297872340425e-05, "loss": 0.4524, "step": 396 }, { "epoch": 0.6331738437001595, "grad_norm": 0.22859412281581318, "learning_rate": 4.3853427895981094e-05, "loss": 0.422, "step": 397 }, { "epoch": 0.6347687400318979, "grad_norm": 0.3252410370829895, "learning_rate": 4.382387706855792e-05, "loss": 0.4599, "step": 398 }, { "epoch": 0.6363636363636364, "grad_norm": 0.23629738722267288, "learning_rate": 4.3794326241134755e-05, "loss": 0.4579, "step": 399 }, { "epoch": 0.6379585326953748, "grad_norm": 0.33572214007009177, "learning_rate": 4.376477541371158e-05, "loss": 0.4491, "step": 400 }, { "epoch": 0.6395534290271132, "grad_norm": 0.28680509925186526, "learning_rate": 4.373522458628842e-05, "loss": 0.4609, "step": 401 }, { "epoch": 0.6411483253588517, "grad_norm": 0.29552820572415844, "learning_rate": 4.370567375886525e-05, "loss": 0.4415, "step": 402 }, { "epoch": 0.6427432216905901, "grad_norm": 0.2646633949886807, "learning_rate": 4.3676122931442085e-05, "loss": 0.4469, "step": 403 }, { "epoch": 0.6443381180223285, "grad_norm": 0.2657551697873454, "learning_rate": 4.364657210401891e-05, "loss": 0.455, "step": 404 }, { "epoch": 0.645933014354067, "grad_norm": 0.2794669581593632, "learning_rate": 4.3617021276595746e-05, "loss": 0.4433, "step": 405 }, { "epoch": 0.6475279106858054, "grad_norm": 0.29070057761655016, "learning_rate": 4.358747044917258e-05, "loss": 0.4589, "step": 406 }, { "epoch": 0.6491228070175439, "grad_norm": 0.30104399246415764, "learning_rate": 4.355791962174941e-05, "loss": 0.4541, "step": 407 }, { "epoch": 0.6507177033492823, "grad_norm": 0.3080215823950904, "learning_rate": 4.352836879432624e-05, "loss": 0.4523, "step": 408 }, { "epoch": 0.6523125996810207, "grad_norm": 0.27901904358740615, "learning_rate": 4.3498817966903076e-05, "loss": 0.4355, "step": 409 }, { "epoch": 0.6539074960127592, "grad_norm": 0.30620853988083685, "learning_rate": 4.346926713947991e-05, "loss": 0.4485, "step": 410 }, { "epoch": 0.6555023923444976, "grad_norm": 0.2594840472992986, "learning_rate": 4.343971631205674e-05, "loss": 0.4421, "step": 411 }, { "epoch": 0.6570972886762361, "grad_norm": 0.3126493623959291, "learning_rate": 4.341016548463357e-05, "loss": 0.4422, "step": 412 }, { "epoch": 0.6586921850079744, "grad_norm": 0.3006598166039491, "learning_rate": 4.3380614657210405e-05, "loss": 0.4569, "step": 413 }, { "epoch": 0.6602870813397129, "grad_norm": 0.3140925392565787, "learning_rate": 4.335106382978724e-05, "loss": 0.4483, "step": 414 }, { "epoch": 0.6618819776714514, "grad_norm": 0.30895503212911773, "learning_rate": 4.3321513002364066e-05, "loss": 0.4585, "step": 415 }, { "epoch": 0.6634768740031898, "grad_norm": 0.3318348573404951, "learning_rate": 4.32919621749409e-05, "loss": 0.4493, "step": 416 }, { "epoch": 0.6650717703349283, "grad_norm": 0.27220337418139073, "learning_rate": 4.3262411347517734e-05, "loss": 0.4563, "step": 417 }, { "epoch": 0.6666666666666666, "grad_norm": 0.35006564210565166, "learning_rate": 4.323286052009456e-05, "loss": 0.4461, "step": 418 }, { "epoch": 0.6682615629984051, "grad_norm": 0.29841440797078034, "learning_rate": 4.3203309692671396e-05, "loss": 0.4397, "step": 419 }, { "epoch": 0.6698564593301436, "grad_norm": 0.30487553003441464, "learning_rate": 4.317375886524823e-05, "loss": 0.4566, "step": 420 }, { "epoch": 0.671451355661882, "grad_norm": 0.2610861816055742, "learning_rate": 4.3144208037825064e-05, "loss": 0.4446, "step": 421 }, { "epoch": 0.6730462519936204, "grad_norm": 0.34509394216969697, "learning_rate": 4.311465721040189e-05, "loss": 0.4458, "step": 422 }, { "epoch": 0.6746411483253588, "grad_norm": 0.2671633648412727, "learning_rate": 4.3085106382978725e-05, "loss": 0.4438, "step": 423 }, { "epoch": 0.6762360446570973, "grad_norm": 0.29662162021383204, "learning_rate": 4.305555555555556e-05, "loss": 0.4565, "step": 424 }, { "epoch": 0.6778309409888357, "grad_norm": 0.30104871886751555, "learning_rate": 4.302600472813239e-05, "loss": 0.4376, "step": 425 }, { "epoch": 0.6794258373205742, "grad_norm": 0.3242241402566701, "learning_rate": 4.299645390070922e-05, "loss": 0.4748, "step": 426 }, { "epoch": 0.6810207336523126, "grad_norm": 0.30427522528681633, "learning_rate": 4.2966903073286055e-05, "loss": 0.4522, "step": 427 }, { "epoch": 0.682615629984051, "grad_norm": 0.35884115344483025, "learning_rate": 4.293735224586289e-05, "loss": 0.4633, "step": 428 }, { "epoch": 0.6842105263157895, "grad_norm": 0.2654370358556637, "learning_rate": 4.2907801418439716e-05, "loss": 0.4559, "step": 429 }, { "epoch": 0.6858054226475279, "grad_norm": 0.3381052508259645, "learning_rate": 4.287825059101655e-05, "loss": 0.4631, "step": 430 }, { "epoch": 0.6874003189792663, "grad_norm": 0.24555010525588164, "learning_rate": 4.284869976359338e-05, "loss": 0.4313, "step": 431 }, { "epoch": 0.6889952153110048, "grad_norm": 0.30352733266979637, "learning_rate": 4.281914893617022e-05, "loss": 0.4704, "step": 432 }, { "epoch": 0.6905901116427432, "grad_norm": 0.3623224363768784, "learning_rate": 4.2789598108747046e-05, "loss": 0.4563, "step": 433 }, { "epoch": 0.6921850079744817, "grad_norm": 0.24608614285169433, "learning_rate": 4.276004728132388e-05, "loss": 0.4267, "step": 434 }, { "epoch": 0.69377990430622, "grad_norm": 0.3318725831481799, "learning_rate": 4.273049645390071e-05, "loss": 0.4527, "step": 435 }, { "epoch": 0.6953748006379585, "grad_norm": 0.28469989857238004, "learning_rate": 4.270094562647755e-05, "loss": 0.4565, "step": 436 }, { "epoch": 0.696969696969697, "grad_norm": 0.2788766304107699, "learning_rate": 4.2671394799054375e-05, "loss": 0.4719, "step": 437 }, { "epoch": 0.6985645933014354, "grad_norm": 0.29898916320409485, "learning_rate": 4.264184397163121e-05, "loss": 0.4503, "step": 438 }, { "epoch": 0.7001594896331739, "grad_norm": 0.24696910624529847, "learning_rate": 4.2612293144208036e-05, "loss": 0.4317, "step": 439 }, { "epoch": 0.7017543859649122, "grad_norm": 0.29687918134488905, "learning_rate": 4.258274231678488e-05, "loss": 0.4415, "step": 440 }, { "epoch": 0.7033492822966507, "grad_norm": 0.24445642649729296, "learning_rate": 4.2553191489361704e-05, "loss": 0.4446, "step": 441 }, { "epoch": 0.7049441786283892, "grad_norm": 0.264584896512391, "learning_rate": 4.252364066193854e-05, "loss": 0.4549, "step": 442 }, { "epoch": 0.7065390749601276, "grad_norm": 0.262049879987939, "learning_rate": 4.2494089834515366e-05, "loss": 0.4498, "step": 443 }, { "epoch": 0.7081339712918661, "grad_norm": 0.2543409522922383, "learning_rate": 4.24645390070922e-05, "loss": 0.4425, "step": 444 }, { "epoch": 0.7097288676236044, "grad_norm": 0.2506745018564669, "learning_rate": 4.2434988179669034e-05, "loss": 0.4315, "step": 445 }, { "epoch": 0.7113237639553429, "grad_norm": 0.2816084377235084, "learning_rate": 4.240543735224586e-05, "loss": 0.4706, "step": 446 }, { "epoch": 0.7129186602870813, "grad_norm": 0.24499455120276784, "learning_rate": 4.2375886524822695e-05, "loss": 0.4355, "step": 447 }, { "epoch": 0.7145135566188198, "grad_norm": 0.26211649846850527, "learning_rate": 4.234633569739953e-05, "loss": 0.4308, "step": 448 }, { "epoch": 0.7161084529505582, "grad_norm": 0.25743703925089906, "learning_rate": 4.231678486997636e-05, "loss": 0.4611, "step": 449 }, { "epoch": 0.7177033492822966, "grad_norm": 0.3467589474054684, "learning_rate": 4.228723404255319e-05, "loss": 0.4488, "step": 450 }, { "epoch": 0.7192982456140351, "grad_norm": 0.28998268829920504, "learning_rate": 4.225768321513003e-05, "loss": 0.4554, "step": 451 }, { "epoch": 0.7208931419457735, "grad_norm": 0.33646398891005735, "learning_rate": 4.222813238770686e-05, "loss": 0.4581, "step": 452 }, { "epoch": 0.722488038277512, "grad_norm": 0.3007000055766117, "learning_rate": 4.219858156028369e-05, "loss": 0.4391, "step": 453 }, { "epoch": 0.7240829346092504, "grad_norm": 0.3141230656067291, "learning_rate": 4.216903073286052e-05, "loss": 0.455, "step": 454 }, { "epoch": 0.7256778309409888, "grad_norm": 0.30202206567216605, "learning_rate": 4.2139479905437354e-05, "loss": 0.4208, "step": 455 }, { "epoch": 0.7272727272727273, "grad_norm": 0.3016117811746841, "learning_rate": 4.210992907801419e-05, "loss": 0.435, "step": 456 }, { "epoch": 0.7288676236044657, "grad_norm": 0.321032298591508, "learning_rate": 4.2080378250591016e-05, "loss": 0.4534, "step": 457 }, { "epoch": 0.7304625199362041, "grad_norm": 0.2544633375576264, "learning_rate": 4.205082742316785e-05, "loss": 0.4377, "step": 458 }, { "epoch": 0.7320574162679426, "grad_norm": 0.3180068343245522, "learning_rate": 4.2021276595744684e-05, "loss": 0.4503, "step": 459 }, { "epoch": 0.733652312599681, "grad_norm": 0.30782391740803805, "learning_rate": 4.199172576832152e-05, "loss": 0.4496, "step": 460 }, { "epoch": 0.7352472089314195, "grad_norm": 0.26182611879174483, "learning_rate": 4.1962174940898345e-05, "loss": 0.4513, "step": 461 }, { "epoch": 0.7368421052631579, "grad_norm": 0.2884383632972822, "learning_rate": 4.193262411347518e-05, "loss": 0.4357, "step": 462 }, { "epoch": 0.7384370015948963, "grad_norm": 0.2840985164535703, "learning_rate": 4.190307328605201e-05, "loss": 0.4342, "step": 463 }, { "epoch": 0.7400318979266348, "grad_norm": 0.29766344557410324, "learning_rate": 4.187352245862885e-05, "loss": 0.4507, "step": 464 }, { "epoch": 0.7416267942583732, "grad_norm": 0.316794458807336, "learning_rate": 4.1843971631205674e-05, "loss": 0.4516, "step": 465 }, { "epoch": 0.7432216905901117, "grad_norm": 0.29735930419092776, "learning_rate": 4.181442080378251e-05, "loss": 0.4631, "step": 466 }, { "epoch": 0.74481658692185, "grad_norm": 0.32468678739210094, "learning_rate": 4.178486997635934e-05, "loss": 0.4587, "step": 467 }, { "epoch": 0.7464114832535885, "grad_norm": 0.26894246913502257, "learning_rate": 4.175531914893617e-05, "loss": 0.4512, "step": 468 }, { "epoch": 0.748006379585327, "grad_norm": 0.30595380138402173, "learning_rate": 4.1725768321513004e-05, "loss": 0.4646, "step": 469 }, { "epoch": 0.7496012759170654, "grad_norm": 0.2525561502449107, "learning_rate": 4.169621749408983e-05, "loss": 0.4491, "step": 470 }, { "epoch": 0.7511961722488039, "grad_norm": 0.27893079776836227, "learning_rate": 4.166666666666667e-05, "loss": 0.4534, "step": 471 }, { "epoch": 0.7527910685805422, "grad_norm": 0.2307286021701795, "learning_rate": 4.16371158392435e-05, "loss": 0.4396, "step": 472 }, { "epoch": 0.7543859649122807, "grad_norm": 0.26723716612692416, "learning_rate": 4.1607565011820333e-05, "loss": 0.4351, "step": 473 }, { "epoch": 0.7559808612440191, "grad_norm": 0.29024091168982546, "learning_rate": 4.157801418439716e-05, "loss": 0.4396, "step": 474 }, { "epoch": 0.7575757575757576, "grad_norm": 0.2831560741560005, "learning_rate": 4.1548463356974e-05, "loss": 0.4589, "step": 475 }, { "epoch": 0.759170653907496, "grad_norm": 0.2811367319146315, "learning_rate": 4.151891252955083e-05, "loss": 0.4437, "step": 476 }, { "epoch": 0.7607655502392344, "grad_norm": 0.2903851582200015, "learning_rate": 4.148936170212766e-05, "loss": 0.45, "step": 477 }, { "epoch": 0.7623604465709729, "grad_norm": 0.297402000566282, "learning_rate": 4.145981087470449e-05, "loss": 0.4559, "step": 478 }, { "epoch": 0.7639553429027113, "grad_norm": 0.31223395462369435, "learning_rate": 4.1430260047281324e-05, "loss": 0.4488, "step": 479 }, { "epoch": 0.7655502392344498, "grad_norm": 0.28209217262079195, "learning_rate": 4.140070921985816e-05, "loss": 0.4425, "step": 480 }, { "epoch": 0.7671451355661882, "grad_norm": 0.2952358924247107, "learning_rate": 4.1371158392434986e-05, "loss": 0.4442, "step": 481 }, { "epoch": 0.7687400318979266, "grad_norm": 0.23228541639026298, "learning_rate": 4.1341607565011826e-05, "loss": 0.4512, "step": 482 }, { "epoch": 0.7703349282296651, "grad_norm": 0.2754673209440299, "learning_rate": 4.1312056737588654e-05, "loss": 0.43, "step": 483 }, { "epoch": 0.7719298245614035, "grad_norm": 0.23544382076263562, "learning_rate": 4.128250591016549e-05, "loss": 0.431, "step": 484 }, { "epoch": 0.773524720893142, "grad_norm": 0.25826939861798565, "learning_rate": 4.1252955082742315e-05, "loss": 0.4463, "step": 485 }, { "epoch": 0.7751196172248804, "grad_norm": 0.2409564042224934, "learning_rate": 4.1223404255319156e-05, "loss": 0.4366, "step": 486 }, { "epoch": 0.7767145135566188, "grad_norm": 0.2436105352395618, "learning_rate": 4.119385342789598e-05, "loss": 0.4304, "step": 487 }, { "epoch": 0.7783094098883573, "grad_norm": 0.25229704193078945, "learning_rate": 4.116430260047282e-05, "loss": 0.4327, "step": 488 }, { "epoch": 0.7799043062200957, "grad_norm": 0.24279604147476647, "learning_rate": 4.1134751773049644e-05, "loss": 0.444, "step": 489 }, { "epoch": 0.7814992025518341, "grad_norm": 0.2585264810066929, "learning_rate": 4.1105200945626485e-05, "loss": 0.4257, "step": 490 }, { "epoch": 0.7830940988835726, "grad_norm": 0.26469084884900546, "learning_rate": 4.107565011820331e-05, "loss": 0.4537, "step": 491 }, { "epoch": 0.784688995215311, "grad_norm": 0.271148095568145, "learning_rate": 4.104609929078015e-05, "loss": 0.4326, "step": 492 }, { "epoch": 0.7862838915470495, "grad_norm": 0.25157528456641337, "learning_rate": 4.1016548463356974e-05, "loss": 0.44, "step": 493 }, { "epoch": 0.7878787878787878, "grad_norm": 0.25102821558725513, "learning_rate": 4.098699763593381e-05, "loss": 0.4466, "step": 494 }, { "epoch": 0.7894736842105263, "grad_norm": 0.23201546657262662, "learning_rate": 4.095744680851064e-05, "loss": 0.4229, "step": 495 }, { "epoch": 0.7910685805422647, "grad_norm": 0.2748278555739574, "learning_rate": 4.092789598108747e-05, "loss": 0.4566, "step": 496 }, { "epoch": 0.7926634768740032, "grad_norm": 0.23053641489566365, "learning_rate": 4.0898345153664303e-05, "loss": 0.4387, "step": 497 }, { "epoch": 0.7942583732057417, "grad_norm": 0.2548395208776042, "learning_rate": 4.086879432624114e-05, "loss": 0.4385, "step": 498 }, { "epoch": 0.79585326953748, "grad_norm": 0.2373529447873327, "learning_rate": 4.083924349881797e-05, "loss": 0.427, "step": 499 }, { "epoch": 0.7974481658692185, "grad_norm": 0.20973059532411292, "learning_rate": 4.08096926713948e-05, "loss": 0.437, "step": 500 }, { "epoch": 0.7990430622009569, "grad_norm": 0.24132303741053088, "learning_rate": 4.078014184397163e-05, "loss": 0.4414, "step": 501 }, { "epoch": 0.8006379585326954, "grad_norm": 0.2327336160260718, "learning_rate": 4.075059101654847e-05, "loss": 0.4374, "step": 502 }, { "epoch": 0.8022328548644339, "grad_norm": 0.2295917463113214, "learning_rate": 4.07210401891253e-05, "loss": 0.4424, "step": 503 }, { "epoch": 0.8038277511961722, "grad_norm": 0.218446797014509, "learning_rate": 4.069148936170213e-05, "loss": 0.4295, "step": 504 }, { "epoch": 0.8054226475279107, "grad_norm": 0.2612142884571014, "learning_rate": 4.066193853427896e-05, "loss": 0.4347, "step": 505 }, { "epoch": 0.8070175438596491, "grad_norm": 0.24966089870175914, "learning_rate": 4.0632387706855796e-05, "loss": 0.4392, "step": 506 }, { "epoch": 0.8086124401913876, "grad_norm": 0.24793522336486234, "learning_rate": 4.0602836879432624e-05, "loss": 0.4401, "step": 507 }, { "epoch": 0.810207336523126, "grad_norm": 0.2309911519885507, "learning_rate": 4.057328605200946e-05, "loss": 0.4184, "step": 508 }, { "epoch": 0.8118022328548644, "grad_norm": 0.26935835860378904, "learning_rate": 4.0543735224586285e-05, "loss": 0.427, "step": 509 }, { "epoch": 0.8133971291866029, "grad_norm": 0.2281102157838881, "learning_rate": 4.0514184397163126e-05, "loss": 0.4324, "step": 510 }, { "epoch": 0.8149920255183413, "grad_norm": 0.2572831457198334, "learning_rate": 4.048463356973995e-05, "loss": 0.4286, "step": 511 }, { "epoch": 0.8165869218500797, "grad_norm": 0.2636071716712076, "learning_rate": 4.045508274231679e-05, "loss": 0.4287, "step": 512 }, { "epoch": 0.8181818181818182, "grad_norm": 0.24541322689423015, "learning_rate": 4.0425531914893614e-05, "loss": 0.4408, "step": 513 }, { "epoch": 0.8197767145135566, "grad_norm": 0.26708875156000716, "learning_rate": 4.0395981087470455e-05, "loss": 0.4449, "step": 514 }, { "epoch": 0.8213716108452951, "grad_norm": 0.24156647868649428, "learning_rate": 4.036643026004728e-05, "loss": 0.4587, "step": 515 }, { "epoch": 0.8229665071770335, "grad_norm": 0.22928294697419785, "learning_rate": 4.033687943262412e-05, "loss": 0.4426, "step": 516 }, { "epoch": 0.8245614035087719, "grad_norm": 0.2356843309351196, "learning_rate": 4.030732860520095e-05, "loss": 0.4412, "step": 517 }, { "epoch": 0.8261562998405104, "grad_norm": 0.23793495344859192, "learning_rate": 4.027777777777778e-05, "loss": 0.4181, "step": 518 }, { "epoch": 0.8277511961722488, "grad_norm": 0.23382151494622733, "learning_rate": 4.024822695035461e-05, "loss": 0.4343, "step": 519 }, { "epoch": 0.8293460925039873, "grad_norm": 0.2917353123238932, "learning_rate": 4.021867612293144e-05, "loss": 0.4601, "step": 520 }, { "epoch": 0.8309409888357256, "grad_norm": 0.23188595731537046, "learning_rate": 4.018912529550828e-05, "loss": 0.4645, "step": 521 }, { "epoch": 0.8325358851674641, "grad_norm": 0.2559297245287039, "learning_rate": 4.015957446808511e-05, "loss": 0.4315, "step": 522 }, { "epoch": 0.8341307814992025, "grad_norm": 0.26087760344484096, "learning_rate": 4.013002364066194e-05, "loss": 0.45, "step": 523 }, { "epoch": 0.835725677830941, "grad_norm": 0.2537429802100604, "learning_rate": 4.010047281323877e-05, "loss": 0.4619, "step": 524 }, { "epoch": 0.8373205741626795, "grad_norm": 0.27907758696777357, "learning_rate": 4.007092198581561e-05, "loss": 0.4532, "step": 525 }, { "epoch": 0.8389154704944178, "grad_norm": 0.25136556619529127, "learning_rate": 4.004137115839244e-05, "loss": 0.4397, "step": 526 }, { "epoch": 0.8405103668261563, "grad_norm": 0.23874563136071106, "learning_rate": 4.001182033096927e-05, "loss": 0.423, "step": 527 }, { "epoch": 0.8421052631578947, "grad_norm": 0.2686872945039607, "learning_rate": 3.99822695035461e-05, "loss": 0.4296, "step": 528 }, { "epoch": 0.8437001594896332, "grad_norm": 0.2695968806847084, "learning_rate": 3.995271867612293e-05, "loss": 0.4375, "step": 529 }, { "epoch": 0.8452950558213717, "grad_norm": 0.24472145088946354, "learning_rate": 3.9923167848699766e-05, "loss": 0.4753, "step": 530 }, { "epoch": 0.84688995215311, "grad_norm": 0.2797571804779977, "learning_rate": 3.9893617021276594e-05, "loss": 0.439, "step": 531 }, { "epoch": 0.8484848484848485, "grad_norm": 0.2750903663991349, "learning_rate": 3.986406619385343e-05, "loss": 0.4513, "step": 532 }, { "epoch": 0.8500797448165869, "grad_norm": 0.27881485031640996, "learning_rate": 3.983451536643026e-05, "loss": 0.4536, "step": 533 }, { "epoch": 0.8516746411483254, "grad_norm": 0.26332424760606077, "learning_rate": 3.9804964539007096e-05, "loss": 0.4413, "step": 534 }, { "epoch": 0.8532695374800638, "grad_norm": 0.2454342334365751, "learning_rate": 3.977541371158392e-05, "loss": 0.4306, "step": 535 }, { "epoch": 0.8548644338118022, "grad_norm": 0.25025741666239, "learning_rate": 3.974586288416076e-05, "loss": 0.4374, "step": 536 }, { "epoch": 0.8564593301435407, "grad_norm": 0.24131107799548113, "learning_rate": 3.971631205673759e-05, "loss": 0.4435, "step": 537 }, { "epoch": 0.8580542264752791, "grad_norm": 0.25248289766475734, "learning_rate": 3.9686761229314425e-05, "loss": 0.4424, "step": 538 }, { "epoch": 0.8596491228070176, "grad_norm": 0.2314598072664094, "learning_rate": 3.965721040189125e-05, "loss": 0.4443, "step": 539 }, { "epoch": 0.861244019138756, "grad_norm": 0.2627851506758425, "learning_rate": 3.962765957446809e-05, "loss": 0.4413, "step": 540 }, { "epoch": 0.8628389154704944, "grad_norm": 0.24313270099998077, "learning_rate": 3.959810874704492e-05, "loss": 0.4407, "step": 541 }, { "epoch": 0.8644338118022329, "grad_norm": 0.24388508831636363, "learning_rate": 3.9568557919621755e-05, "loss": 0.4382, "step": 542 }, { "epoch": 0.8660287081339713, "grad_norm": 0.24456624891582118, "learning_rate": 3.953900709219858e-05, "loss": 0.4584, "step": 543 }, { "epoch": 0.8676236044657097, "grad_norm": 0.29383414444690736, "learning_rate": 3.9509456264775416e-05, "loss": 0.4303, "step": 544 }, { "epoch": 0.8692185007974481, "grad_norm": 0.24721865517967204, "learning_rate": 3.947990543735225e-05, "loss": 0.4247, "step": 545 }, { "epoch": 0.8708133971291866, "grad_norm": 0.27303970341333117, "learning_rate": 3.945035460992908e-05, "loss": 0.4424, "step": 546 }, { "epoch": 0.8724082934609251, "grad_norm": 0.319839983971564, "learning_rate": 3.942080378250591e-05, "loss": 0.4562, "step": 547 }, { "epoch": 0.8740031897926634, "grad_norm": 0.24701604123396062, "learning_rate": 3.9391252955082746e-05, "loss": 0.441, "step": 548 }, { "epoch": 0.8755980861244019, "grad_norm": 0.28836433071285483, "learning_rate": 3.936170212765958e-05, "loss": 0.4183, "step": 549 }, { "epoch": 0.8771929824561403, "grad_norm": 0.25242894574167063, "learning_rate": 3.933215130023641e-05, "loss": 0.4504, "step": 550 }, { "epoch": 0.8787878787878788, "grad_norm": 0.2758683903715597, "learning_rate": 3.930260047281324e-05, "loss": 0.4346, "step": 551 }, { "epoch": 0.8803827751196173, "grad_norm": 0.2765871818480754, "learning_rate": 3.9273049645390075e-05, "loss": 0.4292, "step": 552 }, { "epoch": 0.8819776714513556, "grad_norm": 0.24706246075324767, "learning_rate": 3.924349881796691e-05, "loss": 0.4152, "step": 553 }, { "epoch": 0.8835725677830941, "grad_norm": 0.31122642130395084, "learning_rate": 3.9213947990543736e-05, "loss": 0.4446, "step": 554 }, { "epoch": 0.8851674641148325, "grad_norm": 0.2574315862980409, "learning_rate": 3.918439716312057e-05, "loss": 0.4478, "step": 555 }, { "epoch": 0.886762360446571, "grad_norm": 0.2683503743898351, "learning_rate": 3.9154846335697405e-05, "loss": 0.4494, "step": 556 }, { "epoch": 0.8883572567783095, "grad_norm": 0.2910257606492979, "learning_rate": 3.912529550827423e-05, "loss": 0.4467, "step": 557 }, { "epoch": 0.8899521531100478, "grad_norm": 0.2566632366499328, "learning_rate": 3.9095744680851066e-05, "loss": 0.4426, "step": 558 }, { "epoch": 0.8915470494417863, "grad_norm": 0.3064104444675826, "learning_rate": 3.906619385342789e-05, "loss": 0.4125, "step": 559 }, { "epoch": 0.8931419457735247, "grad_norm": 0.2718104743425938, "learning_rate": 3.9036643026004734e-05, "loss": 0.4308, "step": 560 }, { "epoch": 0.8947368421052632, "grad_norm": 0.24957821645325126, "learning_rate": 3.900709219858156e-05, "loss": 0.4325, "step": 561 }, { "epoch": 0.8963317384370016, "grad_norm": 0.26357079771287745, "learning_rate": 3.8977541371158395e-05, "loss": 0.4285, "step": 562 }, { "epoch": 0.89792663476874, "grad_norm": 0.2539458037799794, "learning_rate": 3.894799054373522e-05, "loss": 0.4396, "step": 563 }, { "epoch": 0.8995215311004785, "grad_norm": 0.2618716776190623, "learning_rate": 3.8918439716312063e-05, "loss": 0.4278, "step": 564 }, { "epoch": 0.9011164274322169, "grad_norm": 0.28847500770121853, "learning_rate": 3.888888888888889e-05, "loss": 0.4539, "step": 565 }, { "epoch": 0.9027113237639554, "grad_norm": 0.22868361449937838, "learning_rate": 3.8859338061465725e-05, "loss": 0.4528, "step": 566 }, { "epoch": 0.9043062200956937, "grad_norm": 0.29708813298242115, "learning_rate": 3.882978723404255e-05, "loss": 0.4449, "step": 567 }, { "epoch": 0.9059011164274322, "grad_norm": 0.2875954586570484, "learning_rate": 3.8800236406619386e-05, "loss": 0.4425, "step": 568 }, { "epoch": 0.9074960127591707, "grad_norm": 0.25934724910293894, "learning_rate": 3.877068557919622e-05, "loss": 0.4488, "step": 569 }, { "epoch": 0.9090909090909091, "grad_norm": 0.26276551384063085, "learning_rate": 3.874113475177305e-05, "loss": 0.4579, "step": 570 }, { "epoch": 0.9106858054226475, "grad_norm": 0.2643123942709685, "learning_rate": 3.871158392434988e-05, "loss": 0.4586, "step": 571 }, { "epoch": 0.9122807017543859, "grad_norm": 0.23528611525125062, "learning_rate": 3.8682033096926716e-05, "loss": 0.4279, "step": 572 }, { "epoch": 0.9138755980861244, "grad_norm": 0.27243173881995403, "learning_rate": 3.865248226950355e-05, "loss": 0.4487, "step": 573 }, { "epoch": 0.9154704944178629, "grad_norm": 0.2556239343960528, "learning_rate": 3.862293144208038e-05, "loss": 0.4422, "step": 574 }, { "epoch": 0.9170653907496013, "grad_norm": 0.2704068453027759, "learning_rate": 3.859338061465721e-05, "loss": 0.4641, "step": 575 }, { "epoch": 0.9186602870813397, "grad_norm": 0.24895300209901458, "learning_rate": 3.8563829787234045e-05, "loss": 0.4329, "step": 576 }, { "epoch": 0.9202551834130781, "grad_norm": 0.28672716323221636, "learning_rate": 3.853427895981088e-05, "loss": 0.4331, "step": 577 }, { "epoch": 0.9218500797448166, "grad_norm": 0.24448644248212276, "learning_rate": 3.8504728132387706e-05, "loss": 0.4204, "step": 578 }, { "epoch": 0.9234449760765551, "grad_norm": 0.24234946544890915, "learning_rate": 3.847517730496454e-05, "loss": 0.4571, "step": 579 }, { "epoch": 0.9250398724082934, "grad_norm": 0.27904457252405757, "learning_rate": 3.8445626477541375e-05, "loss": 0.4172, "step": 580 }, { "epoch": 0.9266347687400319, "grad_norm": 0.2360355918322322, "learning_rate": 3.84160756501182e-05, "loss": 0.4432, "step": 581 }, { "epoch": 0.9282296650717703, "grad_norm": 0.272550600806342, "learning_rate": 3.8386524822695036e-05, "loss": 0.443, "step": 582 }, { "epoch": 0.9298245614035088, "grad_norm": 0.28519304764013914, "learning_rate": 3.835697399527187e-05, "loss": 0.4405, "step": 583 }, { "epoch": 0.9314194577352473, "grad_norm": 0.21496860669602322, "learning_rate": 3.8327423167848704e-05, "loss": 0.4271, "step": 584 }, { "epoch": 0.9330143540669856, "grad_norm": 0.29153051273394154, "learning_rate": 3.829787234042553e-05, "loss": 0.4331, "step": 585 }, { "epoch": 0.9346092503987241, "grad_norm": 0.23703455281838925, "learning_rate": 3.8268321513002365e-05, "loss": 0.4196, "step": 586 }, { "epoch": 0.9362041467304625, "grad_norm": 0.22869354759539506, "learning_rate": 3.82387706855792e-05, "loss": 0.4454, "step": 587 }, { "epoch": 0.937799043062201, "grad_norm": 0.26093798796852186, "learning_rate": 3.8209219858156033e-05, "loss": 0.4227, "step": 588 }, { "epoch": 0.9393939393939394, "grad_norm": 0.2426074804568585, "learning_rate": 3.817966903073286e-05, "loss": 0.4573, "step": 589 }, { "epoch": 0.9409888357256778, "grad_norm": 0.26763632273019244, "learning_rate": 3.8150118203309695e-05, "loss": 0.4265, "step": 590 }, { "epoch": 0.9425837320574163, "grad_norm": 0.25201351587012605, "learning_rate": 3.812056737588653e-05, "loss": 0.4354, "step": 591 }, { "epoch": 0.9441786283891547, "grad_norm": 0.266956509843455, "learning_rate": 3.809101654846336e-05, "loss": 0.4562, "step": 592 }, { "epoch": 0.9457735247208932, "grad_norm": 0.2534959735257615, "learning_rate": 3.806146572104019e-05, "loss": 0.4373, "step": 593 }, { "epoch": 0.9473684210526315, "grad_norm": 0.29400374759940356, "learning_rate": 3.8031914893617024e-05, "loss": 0.445, "step": 594 }, { "epoch": 0.94896331738437, "grad_norm": 0.22290207846432825, "learning_rate": 3.800236406619386e-05, "loss": 0.4453, "step": 595 }, { "epoch": 0.9505582137161085, "grad_norm": 0.27087109568219253, "learning_rate": 3.7972813238770686e-05, "loss": 0.4385, "step": 596 }, { "epoch": 0.9521531100478469, "grad_norm": 0.23885429664281385, "learning_rate": 3.794326241134752e-05, "loss": 0.4333, "step": 597 }, { "epoch": 0.9537480063795853, "grad_norm": 0.2877568927402139, "learning_rate": 3.791371158392435e-05, "loss": 0.4293, "step": 598 }, { "epoch": 0.9553429027113237, "grad_norm": 0.28158650123987766, "learning_rate": 3.788416075650119e-05, "loss": 0.4403, "step": 599 }, { "epoch": 0.9569377990430622, "grad_norm": 0.2610274383714583, "learning_rate": 3.7854609929078015e-05, "loss": 0.4365, "step": 600 }, { "epoch": 0.9585326953748007, "grad_norm": 0.3392494091036515, "learning_rate": 3.782505910165485e-05, "loss": 0.4384, "step": 601 }, { "epoch": 0.960127591706539, "grad_norm": 0.2942833073998688, "learning_rate": 3.7795508274231676e-05, "loss": 0.4275, "step": 602 }, { "epoch": 0.9617224880382775, "grad_norm": 0.323649709240853, "learning_rate": 3.776595744680852e-05, "loss": 0.4268, "step": 603 }, { "epoch": 0.9633173843700159, "grad_norm": 0.25983651383124684, "learning_rate": 3.7736406619385345e-05, "loss": 0.4418, "step": 604 }, { "epoch": 0.9649122807017544, "grad_norm": 0.3271856987871099, "learning_rate": 3.770685579196218e-05, "loss": 0.424, "step": 605 }, { "epoch": 0.9665071770334929, "grad_norm": 0.21097164983290156, "learning_rate": 3.7677304964539006e-05, "loss": 0.4102, "step": 606 }, { "epoch": 0.9681020733652312, "grad_norm": 0.31206499359821144, "learning_rate": 3.764775413711584e-05, "loss": 0.446, "step": 607 }, { "epoch": 0.9696969696969697, "grad_norm": 0.25993280158554694, "learning_rate": 3.7618203309692674e-05, "loss": 0.4304, "step": 608 }, { "epoch": 0.9712918660287081, "grad_norm": 0.28885554143293485, "learning_rate": 3.75886524822695e-05, "loss": 0.4344, "step": 609 }, { "epoch": 0.9728867623604466, "grad_norm": 0.270947750008668, "learning_rate": 3.7559101654846335e-05, "loss": 0.4173, "step": 610 }, { "epoch": 0.9744816586921851, "grad_norm": 0.2811799812301913, "learning_rate": 3.752955082742317e-05, "loss": 0.4376, "step": 611 }, { "epoch": 0.9760765550239234, "grad_norm": 0.27027101407581633, "learning_rate": 3.7500000000000003e-05, "loss": 0.436, "step": 612 }, { "epoch": 0.9776714513556619, "grad_norm": 0.29200693557421864, "learning_rate": 3.747044917257683e-05, "loss": 0.4392, "step": 613 }, { "epoch": 0.9792663476874003, "grad_norm": 0.2617340825243244, "learning_rate": 3.744089834515367e-05, "loss": 0.4358, "step": 614 }, { "epoch": 0.9808612440191388, "grad_norm": 0.2540475100218611, "learning_rate": 3.74113475177305e-05, "loss": 0.4469, "step": 615 }, { "epoch": 0.9824561403508771, "grad_norm": 0.2618622092207595, "learning_rate": 3.738179669030733e-05, "loss": 0.4285, "step": 616 }, { "epoch": 0.9840510366826156, "grad_norm": 0.2168004609963206, "learning_rate": 3.735224586288416e-05, "loss": 0.4485, "step": 617 }, { "epoch": 0.9856459330143541, "grad_norm": 0.2611786271623147, "learning_rate": 3.7322695035460994e-05, "loss": 0.4474, "step": 618 }, { "epoch": 0.9872408293460925, "grad_norm": 0.25317030109556626, "learning_rate": 3.729314420803783e-05, "loss": 0.4636, "step": 619 }, { "epoch": 0.988835725677831, "grad_norm": 0.19566519856353548, "learning_rate": 3.7263593380614656e-05, "loss": 0.4313, "step": 620 }, { "epoch": 0.9904306220095693, "grad_norm": 0.2710955108893377, "learning_rate": 3.723404255319149e-05, "loss": 0.4302, "step": 621 }, { "epoch": 0.9920255183413078, "grad_norm": 0.2274281708833089, "learning_rate": 3.7204491725768324e-05, "loss": 0.4208, "step": 622 }, { "epoch": 0.9936204146730463, "grad_norm": 0.23973240503260088, "learning_rate": 3.717494089834516e-05, "loss": 0.4359, "step": 623 }, { "epoch": 0.9952153110047847, "grad_norm": 0.2703522738982665, "learning_rate": 3.7145390070921985e-05, "loss": 0.4607, "step": 624 }, { "epoch": 0.9968102073365231, "grad_norm": 0.21625253911480288, "learning_rate": 3.711583924349882e-05, "loss": 0.4347, "step": 625 }, { "epoch": 0.9984051036682615, "grad_norm": 0.22299741244031324, "learning_rate": 3.708628841607565e-05, "loss": 0.4255, "step": 626 }, { "epoch": 1.0, "grad_norm": 0.2407231398431226, "learning_rate": 3.705673758865249e-05, "loss": 0.4429, "step": 627 }, { "epoch": 1.0015948963317385, "grad_norm": 0.27726884639567906, "learning_rate": 3.7027186761229315e-05, "loss": 0.3626, "step": 628 }, { "epoch": 1.003189792663477, "grad_norm": 0.23466298525010204, "learning_rate": 3.699763593380615e-05, "loss": 0.3717, "step": 629 }, { "epoch": 1.0047846889952152, "grad_norm": 0.2568486585992098, "learning_rate": 3.696808510638298e-05, "loss": 0.379, "step": 630 }, { "epoch": 1.0063795853269537, "grad_norm": 0.2752453064902119, "learning_rate": 3.693853427895981e-05, "loss": 0.383, "step": 631 }, { "epoch": 1.0079744816586922, "grad_norm": 0.29608706040381005, "learning_rate": 3.6908983451536644e-05, "loss": 0.3718, "step": 632 }, { "epoch": 1.0095693779904307, "grad_norm": 0.2984924416752924, "learning_rate": 3.687943262411347e-05, "loss": 0.3592, "step": 633 }, { "epoch": 1.0111642743221692, "grad_norm": 0.24953683848086036, "learning_rate": 3.684988179669031e-05, "loss": 0.3769, "step": 634 }, { "epoch": 1.0127591706539074, "grad_norm": 0.2810795374140877, "learning_rate": 3.682033096926714e-05, "loss": 0.3463, "step": 635 }, { "epoch": 1.014354066985646, "grad_norm": 0.27038390507279186, "learning_rate": 3.6790780141843974e-05, "loss": 0.3653, "step": 636 }, { "epoch": 1.0159489633173844, "grad_norm": 0.235397524730165, "learning_rate": 3.67612293144208e-05, "loss": 0.3521, "step": 637 }, { "epoch": 1.0175438596491229, "grad_norm": 0.28933903660097177, "learning_rate": 3.673167848699764e-05, "loss": 0.3725, "step": 638 }, { "epoch": 1.0191387559808613, "grad_norm": 0.22783384625759381, "learning_rate": 3.670212765957447e-05, "loss": 0.3573, "step": 639 }, { "epoch": 1.0207336523125996, "grad_norm": 0.24083429224923925, "learning_rate": 3.66725768321513e-05, "loss": 0.3651, "step": 640 }, { "epoch": 1.022328548644338, "grad_norm": 0.2397785152101506, "learning_rate": 3.664302600472813e-05, "loss": 0.3652, "step": 641 }, { "epoch": 1.0239234449760766, "grad_norm": 0.2436661142986919, "learning_rate": 3.661347517730497e-05, "loss": 0.3646, "step": 642 }, { "epoch": 1.025518341307815, "grad_norm": 0.23874473300984694, "learning_rate": 3.65839243498818e-05, "loss": 0.375, "step": 643 }, { "epoch": 1.0271132376395535, "grad_norm": 0.25546612645146477, "learning_rate": 3.655437352245863e-05, "loss": 0.3642, "step": 644 }, { "epoch": 1.0287081339712918, "grad_norm": 0.27196340138701186, "learning_rate": 3.6524822695035466e-05, "loss": 0.3786, "step": 645 }, { "epoch": 1.0303030303030303, "grad_norm": 0.2623261141992434, "learning_rate": 3.6495271867612294e-05, "loss": 0.3764, "step": 646 }, { "epoch": 1.0318979266347688, "grad_norm": 0.22728380623821925, "learning_rate": 3.646572104018913e-05, "loss": 0.3537, "step": 647 }, { "epoch": 1.0334928229665072, "grad_norm": 0.2693931717718797, "learning_rate": 3.6436170212765955e-05, "loss": 0.3806, "step": 648 }, { "epoch": 1.0350877192982457, "grad_norm": 0.25002115813046016, "learning_rate": 3.6406619385342796e-05, "loss": 0.3771, "step": 649 }, { "epoch": 1.036682615629984, "grad_norm": 0.23311024091245958, "learning_rate": 3.637706855791962e-05, "loss": 0.3671, "step": 650 }, { "epoch": 1.0382775119617225, "grad_norm": 0.2199809800530286, "learning_rate": 3.634751773049646e-05, "loss": 0.3854, "step": 651 }, { "epoch": 1.039872408293461, "grad_norm": 0.23216917667108627, "learning_rate": 3.6317966903073285e-05, "loss": 0.3632, "step": 652 }, { "epoch": 1.0414673046251994, "grad_norm": 0.25361651540867836, "learning_rate": 3.6288416075650125e-05, "loss": 0.3597, "step": 653 }, { "epoch": 1.0430622009569377, "grad_norm": 0.23329901954268092, "learning_rate": 3.625886524822695e-05, "loss": 0.3541, "step": 654 }, { "epoch": 1.0446570972886762, "grad_norm": 0.21875484363131792, "learning_rate": 3.622931442080379e-05, "loss": 0.3454, "step": 655 }, { "epoch": 1.0462519936204147, "grad_norm": 0.264068265252116, "learning_rate": 3.6199763593380614e-05, "loss": 0.3884, "step": 656 }, { "epoch": 1.0478468899521531, "grad_norm": 0.26233761076277257, "learning_rate": 3.617021276595745e-05, "loss": 0.3757, "step": 657 }, { "epoch": 1.0494417862838916, "grad_norm": 0.2739144050314966, "learning_rate": 3.614066193853428e-05, "loss": 0.3705, "step": 658 }, { "epoch": 1.0510366826156299, "grad_norm": 0.21672834548462072, "learning_rate": 3.611111111111111e-05, "loss": 0.3641, "step": 659 }, { "epoch": 1.0526315789473684, "grad_norm": 0.288186852123436, "learning_rate": 3.6081560283687944e-05, "loss": 0.3785, "step": 660 }, { "epoch": 1.0542264752791068, "grad_norm": 0.25803200295097567, "learning_rate": 3.605200945626478e-05, "loss": 0.3939, "step": 661 }, { "epoch": 1.0558213716108453, "grad_norm": 0.2500331149238507, "learning_rate": 3.602245862884161e-05, "loss": 0.3615, "step": 662 }, { "epoch": 1.0574162679425838, "grad_norm": 0.2425148953862105, "learning_rate": 3.599290780141844e-05, "loss": 0.3752, "step": 663 }, { "epoch": 1.059011164274322, "grad_norm": 0.2714274500076988, "learning_rate": 3.596335697399527e-05, "loss": 0.3691, "step": 664 }, { "epoch": 1.0606060606060606, "grad_norm": 0.24632741482294027, "learning_rate": 3.593380614657211e-05, "loss": 0.3669, "step": 665 }, { "epoch": 1.062200956937799, "grad_norm": 0.2404372438412565, "learning_rate": 3.590425531914894e-05, "loss": 0.3661, "step": 666 }, { "epoch": 1.0637958532695375, "grad_norm": 0.25899783458673126, "learning_rate": 3.587470449172577e-05, "loss": 0.3581, "step": 667 }, { "epoch": 1.065390749601276, "grad_norm": 0.2573067255085275, "learning_rate": 3.58451536643026e-05, "loss": 0.3643, "step": 668 }, { "epoch": 1.0669856459330143, "grad_norm": 0.23874980115831448, "learning_rate": 3.5815602836879437e-05, "loss": 0.3706, "step": 669 }, { "epoch": 1.0685805422647527, "grad_norm": 0.2430257210940274, "learning_rate": 3.5786052009456264e-05, "loss": 0.3589, "step": 670 }, { "epoch": 1.0701754385964912, "grad_norm": 0.2427962422349631, "learning_rate": 3.57565011820331e-05, "loss": 0.3585, "step": 671 }, { "epoch": 1.0717703349282297, "grad_norm": 0.21735580568653753, "learning_rate": 3.5726950354609925e-05, "loss": 0.3612, "step": 672 }, { "epoch": 1.0733652312599682, "grad_norm": 0.23330324919194168, "learning_rate": 3.5697399527186766e-05, "loss": 0.3387, "step": 673 }, { "epoch": 1.0749601275917064, "grad_norm": 0.22956906934592516, "learning_rate": 3.566784869976359e-05, "loss": 0.3681, "step": 674 }, { "epoch": 1.076555023923445, "grad_norm": 0.24001049172039943, "learning_rate": 3.563829787234043e-05, "loss": 0.3612, "step": 675 }, { "epoch": 1.0781499202551834, "grad_norm": 0.27723064303509914, "learning_rate": 3.5608747044917255e-05, "loss": 0.383, "step": 676 }, { "epoch": 1.079744816586922, "grad_norm": 0.24965263415973382, "learning_rate": 3.5579196217494095e-05, "loss": 0.3561, "step": 677 }, { "epoch": 1.0813397129186604, "grad_norm": 0.2445965975659902, "learning_rate": 3.554964539007092e-05, "loss": 0.3672, "step": 678 }, { "epoch": 1.0829346092503986, "grad_norm": 0.2825305551871455, "learning_rate": 3.552009456264776e-05, "loss": 0.3664, "step": 679 }, { "epoch": 1.0845295055821371, "grad_norm": 0.2320059186043351, "learning_rate": 3.549054373522459e-05, "loss": 0.3822, "step": 680 }, { "epoch": 1.0861244019138756, "grad_norm": 0.237577769422602, "learning_rate": 3.546099290780142e-05, "loss": 0.36, "step": 681 }, { "epoch": 1.087719298245614, "grad_norm": 0.27327324568834366, "learning_rate": 3.543144208037825e-05, "loss": 0.3711, "step": 682 }, { "epoch": 1.0893141945773526, "grad_norm": 0.2093574340018733, "learning_rate": 3.540189125295508e-05, "loss": 0.3461, "step": 683 }, { "epoch": 1.0909090909090908, "grad_norm": 0.2750465052784354, "learning_rate": 3.537234042553192e-05, "loss": 0.3721, "step": 684 }, { "epoch": 1.0925039872408293, "grad_norm": 0.2289680456726834, "learning_rate": 3.534278959810875e-05, "loss": 0.3487, "step": 685 }, { "epoch": 1.0940988835725678, "grad_norm": 0.27520722512098234, "learning_rate": 3.531323877068558e-05, "loss": 0.3779, "step": 686 }, { "epoch": 1.0956937799043063, "grad_norm": 0.21149920653480161, "learning_rate": 3.528368794326241e-05, "loss": 0.3623, "step": 687 }, { "epoch": 1.0972886762360448, "grad_norm": 0.2746921831869083, "learning_rate": 3.525413711583925e-05, "loss": 0.3693, "step": 688 }, { "epoch": 1.098883572567783, "grad_norm": 0.3664969594311578, "learning_rate": 3.522458628841608e-05, "loss": 0.3691, "step": 689 }, { "epoch": 1.1004784688995215, "grad_norm": 0.25872604887064704, "learning_rate": 3.519503546099291e-05, "loss": 0.3719, "step": 690 }, { "epoch": 1.10207336523126, "grad_norm": 0.24452483858079846, "learning_rate": 3.516548463356974e-05, "loss": 0.373, "step": 691 }, { "epoch": 1.1036682615629985, "grad_norm": 0.2889630507900151, "learning_rate": 3.513593380614658e-05, "loss": 0.3753, "step": 692 }, { "epoch": 1.1052631578947367, "grad_norm": 0.23499007845399447, "learning_rate": 3.5106382978723407e-05, "loss": 0.3697, "step": 693 }, { "epoch": 1.1068580542264752, "grad_norm": 0.2734905896158299, "learning_rate": 3.507683215130024e-05, "loss": 0.3666, "step": 694 }, { "epoch": 1.1084529505582137, "grad_norm": 0.2537058889906659, "learning_rate": 3.504728132387707e-05, "loss": 0.3417, "step": 695 }, { "epoch": 1.1100478468899522, "grad_norm": 0.28930400704884734, "learning_rate": 3.50177304964539e-05, "loss": 0.3694, "step": 696 }, { "epoch": 1.1116427432216907, "grad_norm": 0.23925015462873972, "learning_rate": 3.4988179669030736e-05, "loss": 0.3797, "step": 697 }, { "epoch": 1.1132376395534291, "grad_norm": 0.2886961014331654, "learning_rate": 3.495862884160756e-05, "loss": 0.3802, "step": 698 }, { "epoch": 1.1148325358851674, "grad_norm": 0.23246199184364766, "learning_rate": 3.49290780141844e-05, "loss": 0.3723, "step": 699 }, { "epoch": 1.1164274322169059, "grad_norm": 0.25158043021924015, "learning_rate": 3.489952718676123e-05, "loss": 0.3731, "step": 700 }, { "epoch": 1.1180223285486444, "grad_norm": 0.26425450286502083, "learning_rate": 3.4869976359338065e-05, "loss": 0.3567, "step": 701 }, { "epoch": 1.1196172248803828, "grad_norm": 0.21683882158909096, "learning_rate": 3.484042553191489e-05, "loss": 0.3716, "step": 702 }, { "epoch": 1.121212121212121, "grad_norm": 0.23600372096411598, "learning_rate": 3.481087470449173e-05, "loss": 0.3543, "step": 703 }, { "epoch": 1.1228070175438596, "grad_norm": 0.24817483976041105, "learning_rate": 3.478132387706856e-05, "loss": 0.3789, "step": 704 }, { "epoch": 1.124401913875598, "grad_norm": 0.26032396693688337, "learning_rate": 3.4751773049645395e-05, "loss": 0.3606, "step": 705 }, { "epoch": 1.1259968102073366, "grad_norm": 0.25122119095786105, "learning_rate": 3.472222222222222e-05, "loss": 0.331, "step": 706 }, { "epoch": 1.127591706539075, "grad_norm": 0.2456669931837283, "learning_rate": 3.4692671394799056e-05, "loss": 0.3732, "step": 707 }, { "epoch": 1.1291866028708135, "grad_norm": 0.2509621838088167, "learning_rate": 3.466312056737589e-05, "loss": 0.3813, "step": 708 }, { "epoch": 1.1307814992025518, "grad_norm": 0.23750560485981792, "learning_rate": 3.463356973995272e-05, "loss": 0.3827, "step": 709 }, { "epoch": 1.1323763955342903, "grad_norm": 0.22252076959092335, "learning_rate": 3.460401891252955e-05, "loss": 0.3694, "step": 710 }, { "epoch": 1.1339712918660287, "grad_norm": 0.2395019889848866, "learning_rate": 3.4574468085106386e-05, "loss": 0.3511, "step": 711 }, { "epoch": 1.1355661881977672, "grad_norm": 0.2659563662985954, "learning_rate": 3.454491725768322e-05, "loss": 0.3771, "step": 712 }, { "epoch": 1.1371610845295055, "grad_norm": 0.2210257746062931, "learning_rate": 3.451536643026005e-05, "loss": 0.3711, "step": 713 }, { "epoch": 1.138755980861244, "grad_norm": 0.3019194603165432, "learning_rate": 3.448581560283688e-05, "loss": 0.369, "step": 714 }, { "epoch": 1.1403508771929824, "grad_norm": 0.24571601294291673, "learning_rate": 3.4456264775413715e-05, "loss": 0.3632, "step": 715 }, { "epoch": 1.141945773524721, "grad_norm": 0.24154571422860385, "learning_rate": 3.442671394799055e-05, "loss": 0.3986, "step": 716 }, { "epoch": 1.1435406698564594, "grad_norm": 0.2506801355605688, "learning_rate": 3.4397163120567377e-05, "loss": 0.3723, "step": 717 }, { "epoch": 1.1451355661881977, "grad_norm": 0.2511708617933504, "learning_rate": 3.436761229314421e-05, "loss": 0.382, "step": 718 }, { "epoch": 1.1467304625199362, "grad_norm": 0.23796035304084084, "learning_rate": 3.4338061465721045e-05, "loss": 0.3823, "step": 719 }, { "epoch": 1.1483253588516746, "grad_norm": 0.26179331022056795, "learning_rate": 3.430851063829787e-05, "loss": 0.404, "step": 720 }, { "epoch": 1.1499202551834131, "grad_norm": 0.24405242215608658, "learning_rate": 3.4278959810874706e-05, "loss": 0.3655, "step": 721 }, { "epoch": 1.1515151515151516, "grad_norm": 0.22940621322209792, "learning_rate": 3.424940898345153e-05, "loss": 0.3553, "step": 722 }, { "epoch": 1.1531100478468899, "grad_norm": 0.25932523913782085, "learning_rate": 3.4219858156028374e-05, "loss": 0.3865, "step": 723 }, { "epoch": 1.1547049441786283, "grad_norm": 0.2170420991980907, "learning_rate": 3.41903073286052e-05, "loss": 0.3718, "step": 724 }, { "epoch": 1.1562998405103668, "grad_norm": 0.22672105143076451, "learning_rate": 3.4160756501182035e-05, "loss": 0.3798, "step": 725 }, { "epoch": 1.1578947368421053, "grad_norm": 0.24952527180498132, "learning_rate": 3.413120567375886e-05, "loss": 0.3894, "step": 726 }, { "epoch": 1.1594896331738438, "grad_norm": 0.21297108995142244, "learning_rate": 3.4101654846335704e-05, "loss": 0.3646, "step": 727 }, { "epoch": 1.161084529505582, "grad_norm": 0.22396767457211925, "learning_rate": 3.407210401891253e-05, "loss": 0.3661, "step": 728 }, { "epoch": 1.1626794258373205, "grad_norm": 0.234782611507149, "learning_rate": 3.4042553191489365e-05, "loss": 0.3587, "step": 729 }, { "epoch": 1.164274322169059, "grad_norm": 0.23187775562633872, "learning_rate": 3.401300236406619e-05, "loss": 0.3651, "step": 730 }, { "epoch": 1.1658692185007975, "grad_norm": 0.24391192962543787, "learning_rate": 3.3983451536643026e-05, "loss": 0.3774, "step": 731 }, { "epoch": 1.167464114832536, "grad_norm": 0.27964004532596554, "learning_rate": 3.395390070921986e-05, "loss": 0.3715, "step": 732 }, { "epoch": 1.1690590111642742, "grad_norm": 0.22123123806003994, "learning_rate": 3.392434988179669e-05, "loss": 0.3774, "step": 733 }, { "epoch": 1.1706539074960127, "grad_norm": 0.24809315457578637, "learning_rate": 3.389479905437352e-05, "loss": 0.3526, "step": 734 }, { "epoch": 1.1722488038277512, "grad_norm": 0.2275305631364456, "learning_rate": 3.3865248226950356e-05, "loss": 0.3989, "step": 735 }, { "epoch": 1.1738437001594897, "grad_norm": 0.23891631279625167, "learning_rate": 3.383569739952719e-05, "loss": 0.3733, "step": 736 }, { "epoch": 1.1754385964912282, "grad_norm": 0.23550479008795383, "learning_rate": 3.380614657210402e-05, "loss": 0.3599, "step": 737 }, { "epoch": 1.1770334928229664, "grad_norm": 0.24354970655745586, "learning_rate": 3.377659574468085e-05, "loss": 0.3716, "step": 738 }, { "epoch": 1.178628389154705, "grad_norm": 0.22969561677712402, "learning_rate": 3.3747044917257685e-05, "loss": 0.364, "step": 739 }, { "epoch": 1.1802232854864434, "grad_norm": 0.24081862943180918, "learning_rate": 3.371749408983452e-05, "loss": 0.364, "step": 740 }, { "epoch": 1.1818181818181819, "grad_norm": 0.2694659434228782, "learning_rate": 3.3687943262411347e-05, "loss": 0.3548, "step": 741 }, { "epoch": 1.1834130781499201, "grad_norm": 0.24002196053226527, "learning_rate": 3.365839243498818e-05, "loss": 0.3561, "step": 742 }, { "epoch": 1.1850079744816586, "grad_norm": 0.2847416740833475, "learning_rate": 3.3628841607565015e-05, "loss": 0.37, "step": 743 }, { "epoch": 1.186602870813397, "grad_norm": 0.23312883808575718, "learning_rate": 3.359929078014185e-05, "loss": 0.3646, "step": 744 }, { "epoch": 1.1881977671451356, "grad_norm": 0.22653319699325578, "learning_rate": 3.3569739952718676e-05, "loss": 0.3695, "step": 745 }, { "epoch": 1.189792663476874, "grad_norm": 0.24774733870180485, "learning_rate": 3.354018912529551e-05, "loss": 0.3644, "step": 746 }, { "epoch": 1.1913875598086126, "grad_norm": 0.24403985801614203, "learning_rate": 3.3510638297872344e-05, "loss": 0.3841, "step": 747 }, { "epoch": 1.1929824561403508, "grad_norm": 0.20879404738923762, "learning_rate": 3.348108747044917e-05, "loss": 0.3783, "step": 748 }, { "epoch": 1.1945773524720893, "grad_norm": 0.23295592754181643, "learning_rate": 3.3451536643026005e-05, "loss": 0.3835, "step": 749 }, { "epoch": 1.1961722488038278, "grad_norm": 0.2439738248650689, "learning_rate": 3.342198581560284e-05, "loss": 0.3757, "step": 750 }, { "epoch": 1.1977671451355663, "grad_norm": 0.23171616155053887, "learning_rate": 3.3392434988179674e-05, "loss": 0.364, "step": 751 }, { "epoch": 1.1993620414673045, "grad_norm": 0.21935998007070784, "learning_rate": 3.33628841607565e-05, "loss": 0.377, "step": 752 }, { "epoch": 1.200956937799043, "grad_norm": 0.2289256173233073, "learning_rate": 3.3333333333333335e-05, "loss": 0.3401, "step": 753 }, { "epoch": 1.2025518341307815, "grad_norm": 0.23731698914124275, "learning_rate": 3.330378250591017e-05, "loss": 0.3688, "step": 754 }, { "epoch": 1.20414673046252, "grad_norm": 0.2177037125978804, "learning_rate": 3.3274231678487e-05, "loss": 0.3608, "step": 755 }, { "epoch": 1.2057416267942584, "grad_norm": 0.2263553571670602, "learning_rate": 3.324468085106383e-05, "loss": 0.384, "step": 756 }, { "epoch": 1.207336523125997, "grad_norm": 0.26177334245885076, "learning_rate": 3.3215130023640664e-05, "loss": 0.3788, "step": 757 }, { "epoch": 1.2089314194577352, "grad_norm": 0.23603850599114234, "learning_rate": 3.31855791962175e-05, "loss": 0.3655, "step": 758 }, { "epoch": 1.2105263157894737, "grad_norm": 0.22726849599112645, "learning_rate": 3.3156028368794326e-05, "loss": 0.3577, "step": 759 }, { "epoch": 1.2121212121212122, "grad_norm": 0.22627875431870742, "learning_rate": 3.312647754137116e-05, "loss": 0.3512, "step": 760 }, { "epoch": 1.2137161084529506, "grad_norm": 0.23763659386080344, "learning_rate": 3.309692671394799e-05, "loss": 0.3771, "step": 761 }, { "epoch": 1.215311004784689, "grad_norm": 0.2209940029207144, "learning_rate": 3.306737588652483e-05, "loss": 0.362, "step": 762 }, { "epoch": 1.2169059011164274, "grad_norm": 0.23084046129025892, "learning_rate": 3.3037825059101655e-05, "loss": 0.3649, "step": 763 }, { "epoch": 1.2185007974481659, "grad_norm": 0.2347049270537266, "learning_rate": 3.300827423167849e-05, "loss": 0.3662, "step": 764 }, { "epoch": 1.2200956937799043, "grad_norm": 0.2179418521588421, "learning_rate": 3.2978723404255317e-05, "loss": 0.369, "step": 765 }, { "epoch": 1.2216905901116428, "grad_norm": 0.25794777113688894, "learning_rate": 3.294917257683216e-05, "loss": 0.3781, "step": 766 }, { "epoch": 1.223285486443381, "grad_norm": 0.2377302533879308, "learning_rate": 3.2919621749408985e-05, "loss": 0.387, "step": 767 }, { "epoch": 1.2248803827751196, "grad_norm": 0.2753042032690551, "learning_rate": 3.289007092198582e-05, "loss": 0.3714, "step": 768 }, { "epoch": 1.226475279106858, "grad_norm": 0.23984345523981382, "learning_rate": 3.2860520094562646e-05, "loss": 0.351, "step": 769 }, { "epoch": 1.2280701754385965, "grad_norm": 0.22572809275587047, "learning_rate": 3.283096926713948e-05, "loss": 0.3676, "step": 770 }, { "epoch": 1.229665071770335, "grad_norm": 0.230831918771525, "learning_rate": 3.2801418439716314e-05, "loss": 0.371, "step": 771 }, { "epoch": 1.2312599681020733, "grad_norm": 0.2291653761132363, "learning_rate": 3.277186761229314e-05, "loss": 0.3598, "step": 772 }, { "epoch": 1.2328548644338118, "grad_norm": 0.2481689380688067, "learning_rate": 3.2742316784869975e-05, "loss": 0.3599, "step": 773 }, { "epoch": 1.2344497607655502, "grad_norm": 0.2380314416956892, "learning_rate": 3.271276595744681e-05, "loss": 0.3779, "step": 774 }, { "epoch": 1.2360446570972887, "grad_norm": 0.23232298805979357, "learning_rate": 3.2683215130023644e-05, "loss": 0.3731, "step": 775 }, { "epoch": 1.2376395534290272, "grad_norm": 0.23021880045853368, "learning_rate": 3.265366430260047e-05, "loss": 0.3465, "step": 776 }, { "epoch": 1.2392344497607655, "grad_norm": 0.24728413529056734, "learning_rate": 3.262411347517731e-05, "loss": 0.3888, "step": 777 }, { "epoch": 1.240829346092504, "grad_norm": 0.20108981096537443, "learning_rate": 3.259456264775414e-05, "loss": 0.3662, "step": 778 }, { "epoch": 1.2424242424242424, "grad_norm": 0.2329593659496395, "learning_rate": 3.256501182033097e-05, "loss": 0.3592, "step": 779 }, { "epoch": 1.244019138755981, "grad_norm": 0.21273060668704785, "learning_rate": 3.25354609929078e-05, "loss": 0.3724, "step": 780 }, { "epoch": 1.2456140350877192, "grad_norm": 0.22138253300133257, "learning_rate": 3.2505910165484634e-05, "loss": 0.3613, "step": 781 }, { "epoch": 1.2472089314194577, "grad_norm": 0.2289634443556758, "learning_rate": 3.247635933806147e-05, "loss": 0.3911, "step": 782 }, { "epoch": 1.2488038277511961, "grad_norm": 0.2244470578112504, "learning_rate": 3.2446808510638296e-05, "loss": 0.3644, "step": 783 }, { "epoch": 1.2503987240829346, "grad_norm": 0.2380767007221653, "learning_rate": 3.241725768321513e-05, "loss": 0.3576, "step": 784 }, { "epoch": 1.251993620414673, "grad_norm": 0.20949680273966304, "learning_rate": 3.2387706855791964e-05, "loss": 0.3633, "step": 785 }, { "epoch": 1.2535885167464116, "grad_norm": 0.24718748809012475, "learning_rate": 3.23581560283688e-05, "loss": 0.3679, "step": 786 }, { "epoch": 1.2551834130781498, "grad_norm": 0.22751340362839093, "learning_rate": 3.2328605200945625e-05, "loss": 0.347, "step": 787 }, { "epoch": 1.2567783094098883, "grad_norm": 0.2246533916947933, "learning_rate": 3.229905437352246e-05, "loss": 0.3534, "step": 788 }, { "epoch": 1.2583732057416268, "grad_norm": 0.23277305088896993, "learning_rate": 3.226950354609929e-05, "loss": 0.3538, "step": 789 }, { "epoch": 1.2599681020733653, "grad_norm": 0.22839009291665863, "learning_rate": 3.223995271867613e-05, "loss": 0.3686, "step": 790 }, { "epoch": 1.2615629984051036, "grad_norm": 0.22877689916972407, "learning_rate": 3.2210401891252955e-05, "loss": 0.3607, "step": 791 }, { "epoch": 1.263157894736842, "grad_norm": 0.24812467655672968, "learning_rate": 3.218085106382979e-05, "loss": 0.3727, "step": 792 }, { "epoch": 1.2647527910685805, "grad_norm": 0.21377481733767306, "learning_rate": 3.215130023640662e-05, "loss": 0.3611, "step": 793 }, { "epoch": 1.266347687400319, "grad_norm": 0.21718694357804283, "learning_rate": 3.212174940898346e-05, "loss": 0.3635, "step": 794 }, { "epoch": 1.2679425837320575, "grad_norm": 0.2333792089205317, "learning_rate": 3.2092198581560284e-05, "loss": 0.3515, "step": 795 }, { "epoch": 1.269537480063796, "grad_norm": 0.23895412675541292, "learning_rate": 3.206264775413712e-05, "loss": 0.3699, "step": 796 }, { "epoch": 1.2711323763955342, "grad_norm": 0.21390182375095934, "learning_rate": 3.203309692671395e-05, "loss": 0.3537, "step": 797 }, { "epoch": 1.2727272727272727, "grad_norm": 0.22429102191031, "learning_rate": 3.200354609929078e-05, "loss": 0.3643, "step": 798 }, { "epoch": 1.2743221690590112, "grad_norm": 0.2063260867045798, "learning_rate": 3.1973995271867614e-05, "loss": 0.3586, "step": 799 }, { "epoch": 1.2759170653907497, "grad_norm": 0.22667108203114916, "learning_rate": 3.194444444444444e-05, "loss": 0.3718, "step": 800 }, { "epoch": 1.277511961722488, "grad_norm": 0.22723654555834924, "learning_rate": 3.191489361702128e-05, "loss": 0.3757, "step": 801 }, { "epoch": 1.2791068580542264, "grad_norm": 0.20741412289894065, "learning_rate": 3.188534278959811e-05, "loss": 0.3833, "step": 802 }, { "epoch": 1.280701754385965, "grad_norm": 0.21938270589628048, "learning_rate": 3.185579196217494e-05, "loss": 0.3745, "step": 803 }, { "epoch": 1.2822966507177034, "grad_norm": 0.2186238927139606, "learning_rate": 3.182624113475177e-05, "loss": 0.3593, "step": 804 }, { "epoch": 1.2838915470494419, "grad_norm": 0.2320480558424794, "learning_rate": 3.179669030732861e-05, "loss": 0.3695, "step": 805 }, { "epoch": 1.2854864433811803, "grad_norm": 0.20904975321594463, "learning_rate": 3.176713947990544e-05, "loss": 0.3555, "step": 806 }, { "epoch": 1.2870813397129186, "grad_norm": 0.2054879260159393, "learning_rate": 3.173758865248227e-05, "loss": 0.3706, "step": 807 }, { "epoch": 1.288676236044657, "grad_norm": 0.22250690864654826, "learning_rate": 3.1708037825059107e-05, "loss": 0.3841, "step": 808 }, { "epoch": 1.2902711323763956, "grad_norm": 0.22456207363608047, "learning_rate": 3.1678486997635934e-05, "loss": 0.3779, "step": 809 }, { "epoch": 1.291866028708134, "grad_norm": 0.2182370954082192, "learning_rate": 3.164893617021277e-05, "loss": 0.3598, "step": 810 }, { "epoch": 1.2934609250398723, "grad_norm": 0.21135933476861124, "learning_rate": 3.1619385342789595e-05, "loss": 0.3592, "step": 811 }, { "epoch": 1.2950558213716108, "grad_norm": 0.21775258217611246, "learning_rate": 3.1589834515366436e-05, "loss": 0.3702, "step": 812 }, { "epoch": 1.2966507177033493, "grad_norm": 0.21340591844801288, "learning_rate": 3.156028368794326e-05, "loss": 0.3718, "step": 813 }, { "epoch": 1.2982456140350878, "grad_norm": 0.24685174418005584, "learning_rate": 3.15307328605201e-05, "loss": 0.3705, "step": 814 }, { "epoch": 1.2998405103668262, "grad_norm": 0.21631217176179396, "learning_rate": 3.1501182033096925e-05, "loss": 0.3757, "step": 815 }, { "epoch": 1.3014354066985647, "grad_norm": 0.21279126679371685, "learning_rate": 3.1471631205673766e-05, "loss": 0.3608, "step": 816 }, { "epoch": 1.303030303030303, "grad_norm": 0.22189759248469915, "learning_rate": 3.144208037825059e-05, "loss": 0.3503, "step": 817 }, { "epoch": 1.3046251993620415, "grad_norm": 0.21648496516087756, "learning_rate": 3.141252955082743e-05, "loss": 0.3622, "step": 818 }, { "epoch": 1.30622009569378, "grad_norm": 0.2016181846659238, "learning_rate": 3.1382978723404254e-05, "loss": 0.3643, "step": 819 }, { "epoch": 1.3078149920255182, "grad_norm": 0.21490664812943794, "learning_rate": 3.135342789598109e-05, "loss": 0.3562, "step": 820 }, { "epoch": 1.3094098883572567, "grad_norm": 0.20481766174671018, "learning_rate": 3.132387706855792e-05, "loss": 0.3573, "step": 821 }, { "epoch": 1.3110047846889952, "grad_norm": 0.20633733313709948, "learning_rate": 3.129432624113475e-05, "loss": 0.3727, "step": 822 }, { "epoch": 1.3125996810207337, "grad_norm": 0.21561422460044327, "learning_rate": 3.1264775413711584e-05, "loss": 0.3563, "step": 823 }, { "epoch": 1.3141945773524721, "grad_norm": 0.20399367456297554, "learning_rate": 3.123522458628842e-05, "loss": 0.3617, "step": 824 }, { "epoch": 1.3157894736842106, "grad_norm": 0.22134238783873814, "learning_rate": 3.120567375886525e-05, "loss": 0.3703, "step": 825 }, { "epoch": 1.3173843700159489, "grad_norm": 0.2287409235996658, "learning_rate": 3.117612293144208e-05, "loss": 0.3462, "step": 826 }, { "epoch": 1.3189792663476874, "grad_norm": 0.24026514384381348, "learning_rate": 3.114657210401891e-05, "loss": 0.3679, "step": 827 }, { "epoch": 1.3205741626794258, "grad_norm": 0.1938045400369095, "learning_rate": 3.111702127659575e-05, "loss": 0.3674, "step": 828 }, { "epoch": 1.3221690590111643, "grad_norm": 0.2270010359108024, "learning_rate": 3.108747044917258e-05, "loss": 0.3766, "step": 829 }, { "epoch": 1.3237639553429026, "grad_norm": 0.21354148473280898, "learning_rate": 3.105791962174941e-05, "loss": 0.3492, "step": 830 }, { "epoch": 1.325358851674641, "grad_norm": 0.22271173889319387, "learning_rate": 3.102836879432624e-05, "loss": 0.3728, "step": 831 }, { "epoch": 1.3269537480063796, "grad_norm": 0.20947747856652976, "learning_rate": 3.0998817966903077e-05, "loss": 0.3524, "step": 832 }, { "epoch": 1.328548644338118, "grad_norm": 0.1965947894563495, "learning_rate": 3.0969267139479904e-05, "loss": 0.3579, "step": 833 }, { "epoch": 1.3301435406698565, "grad_norm": 0.21654756355384666, "learning_rate": 3.093971631205674e-05, "loss": 0.3564, "step": 834 }, { "epoch": 1.331738437001595, "grad_norm": 0.22257736029845104, "learning_rate": 3.091016548463357e-05, "loss": 0.3637, "step": 835 }, { "epoch": 1.3333333333333333, "grad_norm": 0.1924699274374713, "learning_rate": 3.0880614657210406e-05, "loss": 0.364, "step": 836 }, { "epoch": 1.3349282296650717, "grad_norm": 0.22424684752297547, "learning_rate": 3.085106382978723e-05, "loss": 0.3669, "step": 837 }, { "epoch": 1.3365231259968102, "grad_norm": 0.20040811783367204, "learning_rate": 3.082151300236407e-05, "loss": 0.3597, "step": 838 }, { "epoch": 1.3381180223285487, "grad_norm": 0.2390726870428973, "learning_rate": 3.0791962174940895e-05, "loss": 0.3645, "step": 839 }, { "epoch": 1.339712918660287, "grad_norm": 0.19649254146037223, "learning_rate": 3.0762411347517736e-05, "loss": 0.3601, "step": 840 }, { "epoch": 1.3413078149920254, "grad_norm": 0.23295461077664475, "learning_rate": 3.073286052009456e-05, "loss": 0.3784, "step": 841 }, { "epoch": 1.342902711323764, "grad_norm": 0.2187818331709939, "learning_rate": 3.07033096926714e-05, "loss": 0.3631, "step": 842 }, { "epoch": 1.3444976076555024, "grad_norm": 0.21089918601805702, "learning_rate": 3.067375886524823e-05, "loss": 0.3765, "step": 843 }, { "epoch": 1.346092503987241, "grad_norm": 0.21886294519788524, "learning_rate": 3.0644208037825065e-05, "loss": 0.3527, "step": 844 }, { "epoch": 1.3476874003189794, "grad_norm": 0.22373456334143674, "learning_rate": 3.061465721040189e-05, "loss": 0.3727, "step": 845 }, { "epoch": 1.3492822966507176, "grad_norm": 0.19721594737013062, "learning_rate": 3.0585106382978726e-05, "loss": 0.354, "step": 846 }, { "epoch": 1.3508771929824561, "grad_norm": 0.24698550683041035, "learning_rate": 3.055555555555556e-05, "loss": 0.3645, "step": 847 }, { "epoch": 1.3524720893141946, "grad_norm": 0.24323500774461662, "learning_rate": 3.052600472813239e-05, "loss": 0.3616, "step": 848 }, { "epoch": 1.354066985645933, "grad_norm": 0.21782892486571712, "learning_rate": 3.0496453900709222e-05, "loss": 0.3611, "step": 849 }, { "epoch": 1.3556618819776713, "grad_norm": 0.21209002942660196, "learning_rate": 3.0466903073286052e-05, "loss": 0.3659, "step": 850 }, { "epoch": 1.3572567783094098, "grad_norm": 0.2586978015679334, "learning_rate": 3.0437352245862886e-05, "loss": 0.381, "step": 851 }, { "epoch": 1.3588516746411483, "grad_norm": 0.2163635969682371, "learning_rate": 3.0407801418439717e-05, "loss": 0.3686, "step": 852 }, { "epoch": 1.3604465709728868, "grad_norm": 0.2122153183551515, "learning_rate": 3.0378250591016548e-05, "loss": 0.3571, "step": 853 }, { "epoch": 1.3620414673046253, "grad_norm": 0.28385227164420307, "learning_rate": 3.034869976359338e-05, "loss": 0.382, "step": 854 }, { "epoch": 1.3636363636363638, "grad_norm": 0.24518426468937843, "learning_rate": 3.0319148936170216e-05, "loss": 0.388, "step": 855 }, { "epoch": 1.365231259968102, "grad_norm": 0.2717570755837066, "learning_rate": 3.0289598108747047e-05, "loss": 0.4087, "step": 856 }, { "epoch": 1.3668261562998405, "grad_norm": 0.23519790162792636, "learning_rate": 3.0260047281323877e-05, "loss": 0.3726, "step": 857 }, { "epoch": 1.368421052631579, "grad_norm": 0.23786203052177218, "learning_rate": 3.0230496453900708e-05, "loss": 0.3654, "step": 858 }, { "epoch": 1.3700159489633175, "grad_norm": 0.23417551465362954, "learning_rate": 3.0200945626477545e-05, "loss": 0.3681, "step": 859 }, { "epoch": 1.3716108452950557, "grad_norm": 0.24301025105738364, "learning_rate": 3.0171394799054376e-05, "loss": 0.3706, "step": 860 }, { "epoch": 1.3732057416267942, "grad_norm": 0.20558967559901922, "learning_rate": 3.0141843971631207e-05, "loss": 0.3508, "step": 861 }, { "epoch": 1.3748006379585327, "grad_norm": 0.25663476368659954, "learning_rate": 3.0112293144208037e-05, "loss": 0.381, "step": 862 }, { "epoch": 1.3763955342902712, "grad_norm": 0.2315399614870477, "learning_rate": 3.0082742316784875e-05, "loss": 0.372, "step": 863 }, { "epoch": 1.3779904306220097, "grad_norm": 0.22267643110720403, "learning_rate": 3.0053191489361706e-05, "loss": 0.349, "step": 864 }, { "epoch": 1.3795853269537481, "grad_norm": 0.23034797598031082, "learning_rate": 3.0023640661938536e-05, "loss": 0.3666, "step": 865 }, { "epoch": 1.3811802232854864, "grad_norm": 0.23456245251558872, "learning_rate": 2.9994089834515367e-05, "loss": 0.3819, "step": 866 }, { "epoch": 1.3827751196172249, "grad_norm": 0.22244766817181486, "learning_rate": 2.99645390070922e-05, "loss": 0.3808, "step": 867 }, { "epoch": 1.3843700159489634, "grad_norm": 0.25299353663746865, "learning_rate": 2.993498817966903e-05, "loss": 0.3663, "step": 868 }, { "epoch": 1.3859649122807016, "grad_norm": 0.2554760180487494, "learning_rate": 2.9905437352245862e-05, "loss": 0.3874, "step": 869 }, { "epoch": 1.38755980861244, "grad_norm": 0.2334896251836629, "learning_rate": 2.9875886524822693e-05, "loss": 0.3549, "step": 870 }, { "epoch": 1.3891547049441786, "grad_norm": 0.23450772022169503, "learning_rate": 2.984633569739953e-05, "loss": 0.368, "step": 871 }, { "epoch": 1.390749601275917, "grad_norm": 0.219360964289364, "learning_rate": 2.981678486997636e-05, "loss": 0.3545, "step": 872 }, { "epoch": 1.3923444976076556, "grad_norm": 0.22335716716084147, "learning_rate": 2.9787234042553192e-05, "loss": 0.377, "step": 873 }, { "epoch": 1.393939393939394, "grad_norm": 0.22486597479305415, "learning_rate": 2.975768321513003e-05, "loss": 0.3883, "step": 874 }, { "epoch": 1.3955342902711323, "grad_norm": 0.22480061009477403, "learning_rate": 2.972813238770686e-05, "loss": 0.3549, "step": 875 }, { "epoch": 1.3971291866028708, "grad_norm": 0.23424700480480778, "learning_rate": 2.969858156028369e-05, "loss": 0.3812, "step": 876 }, { "epoch": 1.3987240829346093, "grad_norm": 0.22557512948025574, "learning_rate": 2.966903073286052e-05, "loss": 0.3792, "step": 877 }, { "epoch": 1.4003189792663477, "grad_norm": 0.24115711646217888, "learning_rate": 2.9639479905437355e-05, "loss": 0.3576, "step": 878 }, { "epoch": 1.401913875598086, "grad_norm": 0.2107082064464784, "learning_rate": 2.9609929078014186e-05, "loss": 0.3763, "step": 879 }, { "epoch": 1.4035087719298245, "grad_norm": 0.21097442307055492, "learning_rate": 2.9580378250591017e-05, "loss": 0.3539, "step": 880 }, { "epoch": 1.405103668261563, "grad_norm": 0.20816160970123496, "learning_rate": 2.9550827423167847e-05, "loss": 0.3553, "step": 881 }, { "epoch": 1.4066985645933014, "grad_norm": 0.19186054056440108, "learning_rate": 2.9521276595744685e-05, "loss": 0.3666, "step": 882 }, { "epoch": 1.40829346092504, "grad_norm": 0.21875211623161844, "learning_rate": 2.9491725768321515e-05, "loss": 0.3968, "step": 883 }, { "epoch": 1.4098883572567784, "grad_norm": 0.21905107936738918, "learning_rate": 2.9462174940898346e-05, "loss": 0.3772, "step": 884 }, { "epoch": 1.4114832535885167, "grad_norm": 0.2124497274130083, "learning_rate": 2.9432624113475177e-05, "loss": 0.3697, "step": 885 }, { "epoch": 1.4130781499202552, "grad_norm": 0.20900477934560935, "learning_rate": 2.9403073286052014e-05, "loss": 0.3656, "step": 886 }, { "epoch": 1.4146730462519936, "grad_norm": 0.22450811481660327, "learning_rate": 2.9373522458628845e-05, "loss": 0.3656, "step": 887 }, { "epoch": 1.4162679425837321, "grad_norm": 0.21954284771074475, "learning_rate": 2.9343971631205676e-05, "loss": 0.4018, "step": 888 }, { "epoch": 1.4178628389154704, "grad_norm": 0.20021989485036315, "learning_rate": 2.9314420803782506e-05, "loss": 0.3602, "step": 889 }, { "epoch": 1.4194577352472089, "grad_norm": 0.21755311891204648, "learning_rate": 2.928486997635934e-05, "loss": 0.3883, "step": 890 }, { "epoch": 1.4210526315789473, "grad_norm": 0.20735927538328688, "learning_rate": 2.925531914893617e-05, "loss": 0.3774, "step": 891 }, { "epoch": 1.4226475279106858, "grad_norm": 0.19927127050380705, "learning_rate": 2.9225768321513e-05, "loss": 0.3589, "step": 892 }, { "epoch": 1.4242424242424243, "grad_norm": 0.23942263112093864, "learning_rate": 2.9196217494089832e-05, "loss": 0.3757, "step": 893 }, { "epoch": 1.4258373205741628, "grad_norm": 0.22621609076315397, "learning_rate": 2.916666666666667e-05, "loss": 0.3859, "step": 894 }, { "epoch": 1.427432216905901, "grad_norm": 0.23566292922656845, "learning_rate": 2.91371158392435e-05, "loss": 0.3681, "step": 895 }, { "epoch": 1.4290271132376395, "grad_norm": 0.23920370323215645, "learning_rate": 2.910756501182033e-05, "loss": 0.3676, "step": 896 }, { "epoch": 1.430622009569378, "grad_norm": 0.21793305720118122, "learning_rate": 2.9078014184397162e-05, "loss": 0.3745, "step": 897 }, { "epoch": 1.4322169059011165, "grad_norm": 0.2830422862007345, "learning_rate": 2.9048463356974e-05, "loss": 0.3706, "step": 898 }, { "epoch": 1.4338118022328548, "grad_norm": 0.2311410321442392, "learning_rate": 2.901891252955083e-05, "loss": 0.3763, "step": 899 }, { "epoch": 1.4354066985645932, "grad_norm": 0.23746288975904176, "learning_rate": 2.898936170212766e-05, "loss": 0.3594, "step": 900 }, { "epoch": 1.4370015948963317, "grad_norm": 0.2326404184471103, "learning_rate": 2.895981087470449e-05, "loss": 0.3687, "step": 901 }, { "epoch": 1.4385964912280702, "grad_norm": 0.21980277012362007, "learning_rate": 2.8930260047281325e-05, "loss": 0.3858, "step": 902 }, { "epoch": 1.4401913875598087, "grad_norm": 0.24081776233382735, "learning_rate": 2.8900709219858156e-05, "loss": 0.3736, "step": 903 }, { "epoch": 1.4417862838915472, "grad_norm": 0.24878578490855063, "learning_rate": 2.8871158392434987e-05, "loss": 0.3964, "step": 904 }, { "epoch": 1.4433811802232854, "grad_norm": 0.22101447011934336, "learning_rate": 2.8841607565011824e-05, "loss": 0.3717, "step": 905 }, { "epoch": 1.444976076555024, "grad_norm": 0.24740732250958888, "learning_rate": 2.8812056737588655e-05, "loss": 0.3756, "step": 906 }, { "epoch": 1.4465709728867624, "grad_norm": 0.24545683027304022, "learning_rate": 2.8782505910165485e-05, "loss": 0.3776, "step": 907 }, { "epoch": 1.4481658692185009, "grad_norm": 0.2022008092476536, "learning_rate": 2.8752955082742316e-05, "loss": 0.3499, "step": 908 }, { "epoch": 1.4497607655502391, "grad_norm": 0.25445781336989076, "learning_rate": 2.8723404255319154e-05, "loss": 0.3793, "step": 909 }, { "epoch": 1.4513556618819776, "grad_norm": 0.21375308263105722, "learning_rate": 2.8693853427895984e-05, "loss": 0.3643, "step": 910 }, { "epoch": 1.452950558213716, "grad_norm": 0.2636339219571946, "learning_rate": 2.8664302600472815e-05, "loss": 0.3639, "step": 911 }, { "epoch": 1.4545454545454546, "grad_norm": 0.23584499761429634, "learning_rate": 2.8634751773049646e-05, "loss": 0.359, "step": 912 }, { "epoch": 1.456140350877193, "grad_norm": 0.24579341390114595, "learning_rate": 2.8605200945626483e-05, "loss": 0.3542, "step": 913 }, { "epoch": 1.4577352472089316, "grad_norm": 0.23380505080896416, "learning_rate": 2.8575650118203314e-05, "loss": 0.3698, "step": 914 }, { "epoch": 1.4593301435406698, "grad_norm": 0.2118574600355282, "learning_rate": 2.8546099290780144e-05, "loss": 0.3705, "step": 915 }, { "epoch": 1.4609250398724083, "grad_norm": 0.24536715319576624, "learning_rate": 2.8516548463356975e-05, "loss": 0.3882, "step": 916 }, { "epoch": 1.4625199362041468, "grad_norm": 0.21981713778283885, "learning_rate": 2.848699763593381e-05, "loss": 0.3885, "step": 917 }, { "epoch": 1.464114832535885, "grad_norm": 0.2182713153840085, "learning_rate": 2.845744680851064e-05, "loss": 0.3667, "step": 918 }, { "epoch": 1.4657097288676235, "grad_norm": 0.21169577402325512, "learning_rate": 2.842789598108747e-05, "loss": 0.3612, "step": 919 }, { "epoch": 1.467304625199362, "grad_norm": 0.24411935473191396, "learning_rate": 2.83983451536643e-05, "loss": 0.3665, "step": 920 }, { "epoch": 1.4688995215311005, "grad_norm": 0.2579332485292547, "learning_rate": 2.836879432624114e-05, "loss": 0.374, "step": 921 }, { "epoch": 1.470494417862839, "grad_norm": 0.2345638070589613, "learning_rate": 2.833924349881797e-05, "loss": 0.3665, "step": 922 }, { "epoch": 1.4720893141945774, "grad_norm": 0.2332960488447943, "learning_rate": 2.83096926713948e-05, "loss": 0.3568, "step": 923 }, { "epoch": 1.4736842105263157, "grad_norm": 0.23452050543621616, "learning_rate": 2.828014184397163e-05, "loss": 0.3568, "step": 924 }, { "epoch": 1.4752791068580542, "grad_norm": 0.22216111232106314, "learning_rate": 2.8250591016548468e-05, "loss": 0.3682, "step": 925 }, { "epoch": 1.4768740031897927, "grad_norm": 0.23460707400650643, "learning_rate": 2.82210401891253e-05, "loss": 0.3703, "step": 926 }, { "epoch": 1.4784688995215312, "grad_norm": 0.24494908003467314, "learning_rate": 2.819148936170213e-05, "loss": 0.3632, "step": 927 }, { "epoch": 1.4800637958532694, "grad_norm": 0.1986116220380296, "learning_rate": 2.816193853427896e-05, "loss": 0.3588, "step": 928 }, { "epoch": 1.481658692185008, "grad_norm": 0.2328423892822764, "learning_rate": 2.8132387706855794e-05, "loss": 0.3639, "step": 929 }, { "epoch": 1.4832535885167464, "grad_norm": 0.2365708049039806, "learning_rate": 2.8102836879432625e-05, "loss": 0.3588, "step": 930 }, { "epoch": 1.4848484848484849, "grad_norm": 0.19911927305716656, "learning_rate": 2.8073286052009455e-05, "loss": 0.3758, "step": 931 }, { "epoch": 1.4864433811802233, "grad_norm": 0.2056769799572001, "learning_rate": 2.8043735224586286e-05, "loss": 0.3711, "step": 932 }, { "epoch": 1.4880382775119618, "grad_norm": 0.2121094733624876, "learning_rate": 2.8014184397163124e-05, "loss": 0.366, "step": 933 }, { "epoch": 1.4896331738437, "grad_norm": 0.2226719090112622, "learning_rate": 2.7984633569739954e-05, "loss": 0.3543, "step": 934 }, { "epoch": 1.4912280701754386, "grad_norm": 0.20765994988678865, "learning_rate": 2.7955082742316785e-05, "loss": 0.3751, "step": 935 }, { "epoch": 1.492822966507177, "grad_norm": 0.2178533443388553, "learning_rate": 2.7925531914893616e-05, "loss": 0.3686, "step": 936 }, { "epoch": 1.4944178628389155, "grad_norm": 0.23174613628779323, "learning_rate": 2.7895981087470453e-05, "loss": 0.3684, "step": 937 }, { "epoch": 1.4960127591706538, "grad_norm": 0.21203905265802545, "learning_rate": 2.7866430260047284e-05, "loss": 0.3519, "step": 938 }, { "epoch": 1.4976076555023923, "grad_norm": 0.20580812878529314, "learning_rate": 2.7836879432624114e-05, "loss": 0.3687, "step": 939 }, { "epoch": 1.4992025518341308, "grad_norm": 0.21596134589211777, "learning_rate": 2.780732860520095e-05, "loss": 0.37, "step": 940 }, { "epoch": 1.5007974481658692, "grad_norm": 0.2260626625688235, "learning_rate": 2.777777777777778e-05, "loss": 0.3783, "step": 941 }, { "epoch": 1.5023923444976077, "grad_norm": 0.26242688879024134, "learning_rate": 2.774822695035461e-05, "loss": 0.3673, "step": 942 }, { "epoch": 1.5039872408293462, "grad_norm": 0.22021159076634833, "learning_rate": 2.771867612293144e-05, "loss": 0.3872, "step": 943 }, { "epoch": 1.5055821371610847, "grad_norm": 0.24209678699743306, "learning_rate": 2.7689125295508278e-05, "loss": 0.377, "step": 944 }, { "epoch": 1.507177033492823, "grad_norm": 0.22833533419877963, "learning_rate": 2.765957446808511e-05, "loss": 0.3648, "step": 945 }, { "epoch": 1.5087719298245614, "grad_norm": 0.22516811581761464, "learning_rate": 2.763002364066194e-05, "loss": 0.3967, "step": 946 }, { "epoch": 1.5103668261562997, "grad_norm": 0.2115481828521072, "learning_rate": 2.760047281323877e-05, "loss": 0.3492, "step": 947 }, { "epoch": 1.5119617224880382, "grad_norm": 0.23869063836666324, "learning_rate": 2.7570921985815607e-05, "loss": 0.381, "step": 948 }, { "epoch": 1.5135566188197767, "grad_norm": 0.23070921341706127, "learning_rate": 2.7541371158392438e-05, "loss": 0.3788, "step": 949 }, { "epoch": 1.5151515151515151, "grad_norm": 0.217022119155353, "learning_rate": 2.751182033096927e-05, "loss": 0.3593, "step": 950 }, { "epoch": 1.5167464114832536, "grad_norm": 0.21374654492567197, "learning_rate": 2.74822695035461e-05, "loss": 0.3672, "step": 951 }, { "epoch": 1.518341307814992, "grad_norm": 0.23313319311246797, "learning_rate": 2.7452718676122933e-05, "loss": 0.3746, "step": 952 }, { "epoch": 1.5199362041467306, "grad_norm": 0.20572807250712646, "learning_rate": 2.7423167848699764e-05, "loss": 0.3551, "step": 953 }, { "epoch": 1.5215311004784688, "grad_norm": 0.1997805318126903, "learning_rate": 2.7393617021276595e-05, "loss": 0.3432, "step": 954 }, { "epoch": 1.5231259968102073, "grad_norm": 0.26392919068089526, "learning_rate": 2.7364066193853425e-05, "loss": 0.3725, "step": 955 }, { "epoch": 1.5247208931419458, "grad_norm": 0.244962842483798, "learning_rate": 2.7334515366430263e-05, "loss": 0.3987, "step": 956 }, { "epoch": 1.526315789473684, "grad_norm": 0.23739990951852485, "learning_rate": 2.7304964539007094e-05, "loss": 0.3544, "step": 957 }, { "epoch": 1.5279106858054226, "grad_norm": 0.23274957656389275, "learning_rate": 2.7275413711583924e-05, "loss": 0.3733, "step": 958 }, { "epoch": 1.529505582137161, "grad_norm": 0.24800091621944534, "learning_rate": 2.7245862884160755e-05, "loss": 0.366, "step": 959 }, { "epoch": 1.5311004784688995, "grad_norm": 0.22394308843105284, "learning_rate": 2.7216312056737592e-05, "loss": 0.3617, "step": 960 }, { "epoch": 1.532695374800638, "grad_norm": 0.22830700014212593, "learning_rate": 2.7186761229314423e-05, "loss": 0.3732, "step": 961 }, { "epoch": 1.5342902711323765, "grad_norm": 0.2017547638154948, "learning_rate": 2.7157210401891254e-05, "loss": 0.3502, "step": 962 }, { "epoch": 1.535885167464115, "grad_norm": 0.2199542908785551, "learning_rate": 2.7127659574468084e-05, "loss": 0.3744, "step": 963 }, { "epoch": 1.5374800637958532, "grad_norm": 0.22925618470081205, "learning_rate": 2.7098108747044922e-05, "loss": 0.3431, "step": 964 }, { "epoch": 1.5390749601275917, "grad_norm": 0.21137539140900416, "learning_rate": 2.7068557919621753e-05, "loss": 0.3702, "step": 965 }, { "epoch": 1.5406698564593302, "grad_norm": 0.2418928435992513, "learning_rate": 2.7039007092198583e-05, "loss": 0.35, "step": 966 }, { "epoch": 1.5422647527910684, "grad_norm": 0.21830672338161386, "learning_rate": 2.7009456264775414e-05, "loss": 0.3503, "step": 967 }, { "epoch": 1.543859649122807, "grad_norm": 0.21844147775782205, "learning_rate": 2.6979905437352248e-05, "loss": 0.3655, "step": 968 }, { "epoch": 1.5454545454545454, "grad_norm": 0.23611475273808233, "learning_rate": 2.695035460992908e-05, "loss": 0.3761, "step": 969 }, { "epoch": 1.547049441786284, "grad_norm": 0.2249095392300342, "learning_rate": 2.692080378250591e-05, "loss": 0.3735, "step": 970 }, { "epoch": 1.5486443381180224, "grad_norm": 0.21613140243361573, "learning_rate": 2.6891252955082747e-05, "loss": 0.3955, "step": 971 }, { "epoch": 1.5502392344497609, "grad_norm": 0.24622668005782347, "learning_rate": 2.6861702127659577e-05, "loss": 0.3721, "step": 972 }, { "epoch": 1.5518341307814993, "grad_norm": 0.20577363475959934, "learning_rate": 2.6832151300236408e-05, "loss": 0.3639, "step": 973 }, { "epoch": 1.5534290271132376, "grad_norm": 0.22219903441818162, "learning_rate": 2.680260047281324e-05, "loss": 0.3493, "step": 974 }, { "epoch": 1.555023923444976, "grad_norm": 0.2733255486015304, "learning_rate": 2.6773049645390076e-05, "loss": 0.3923, "step": 975 }, { "epoch": 1.5566188197767146, "grad_norm": 0.1952243366312519, "learning_rate": 2.6743498817966907e-05, "loss": 0.3672, "step": 976 }, { "epoch": 1.5582137161084528, "grad_norm": 0.22656234204256634, "learning_rate": 2.6713947990543738e-05, "loss": 0.3608, "step": 977 }, { "epoch": 1.5598086124401913, "grad_norm": 0.20831402676135102, "learning_rate": 2.6684397163120568e-05, "loss": 0.3426, "step": 978 }, { "epoch": 1.5614035087719298, "grad_norm": 0.20201160546310964, "learning_rate": 2.6654846335697402e-05, "loss": 0.3449, "step": 979 }, { "epoch": 1.5629984051036683, "grad_norm": 0.22015870310560104, "learning_rate": 2.6625295508274233e-05, "loss": 0.3751, "step": 980 }, { "epoch": 1.5645933014354068, "grad_norm": 0.2143350137663692, "learning_rate": 2.6595744680851064e-05, "loss": 0.3666, "step": 981 }, { "epoch": 1.5661881977671452, "grad_norm": 0.20683828402748242, "learning_rate": 2.6566193853427894e-05, "loss": 0.3636, "step": 982 }, { "epoch": 1.5677830940988837, "grad_norm": 0.210446563436305, "learning_rate": 2.6536643026004732e-05, "loss": 0.3547, "step": 983 }, { "epoch": 1.569377990430622, "grad_norm": 0.2080374743800072, "learning_rate": 2.6507092198581562e-05, "loss": 0.3569, "step": 984 }, { "epoch": 1.5709728867623605, "grad_norm": 0.22824539151256767, "learning_rate": 2.6477541371158393e-05, "loss": 0.3721, "step": 985 }, { "epoch": 1.5725677830940987, "grad_norm": 0.1968578002355595, "learning_rate": 2.6447990543735224e-05, "loss": 0.3636, "step": 986 }, { "epoch": 1.5741626794258372, "grad_norm": 0.21860601654787826, "learning_rate": 2.641843971631206e-05, "loss": 0.3723, "step": 987 }, { "epoch": 1.5757575757575757, "grad_norm": 0.22246193373943018, "learning_rate": 2.6388888888888892e-05, "loss": 0.3693, "step": 988 }, { "epoch": 1.5773524720893142, "grad_norm": 0.2099023168749042, "learning_rate": 2.6359338061465723e-05, "loss": 0.3607, "step": 989 }, { "epoch": 1.5789473684210527, "grad_norm": 0.23679716665691483, "learning_rate": 2.6329787234042553e-05, "loss": 0.3801, "step": 990 }, { "epoch": 1.5805422647527911, "grad_norm": 0.21215001480369775, "learning_rate": 2.6300236406619387e-05, "loss": 0.3548, "step": 991 }, { "epoch": 1.5821371610845296, "grad_norm": 0.23513065116583215, "learning_rate": 2.6270685579196218e-05, "loss": 0.3821, "step": 992 }, { "epoch": 1.583732057416268, "grad_norm": 0.1979795640249849, "learning_rate": 2.624113475177305e-05, "loss": 0.3571, "step": 993 }, { "epoch": 1.5853269537480064, "grad_norm": 0.2102361117998075, "learning_rate": 2.621158392434988e-05, "loss": 0.3607, "step": 994 }, { "epoch": 1.5869218500797448, "grad_norm": 0.19275886346893673, "learning_rate": 2.6182033096926717e-05, "loss": 0.3668, "step": 995 }, { "epoch": 1.588516746411483, "grad_norm": 0.21194526280602768, "learning_rate": 2.6152482269503547e-05, "loss": 0.3694, "step": 996 }, { "epoch": 1.5901116427432216, "grad_norm": 0.21010717155988434, "learning_rate": 2.6122931442080378e-05, "loss": 0.3848, "step": 997 }, { "epoch": 1.59170653907496, "grad_norm": 0.21827090192042423, "learning_rate": 2.609338061465721e-05, "loss": 0.3745, "step": 998 }, { "epoch": 1.5933014354066986, "grad_norm": 0.2112728701540889, "learning_rate": 2.6063829787234046e-05, "loss": 0.3619, "step": 999 }, { "epoch": 1.594896331738437, "grad_norm": 0.19961291179401464, "learning_rate": 2.6034278959810877e-05, "loss": 0.3673, "step": 1000 }, { "epoch": 1.5964912280701755, "grad_norm": 0.22865330762410388, "learning_rate": 2.6004728132387708e-05, "loss": 0.3596, "step": 1001 }, { "epoch": 1.598086124401914, "grad_norm": 0.23538393146310016, "learning_rate": 2.5975177304964538e-05, "loss": 0.3727, "step": 1002 }, { "epoch": 1.5996810207336523, "grad_norm": 0.2123494160546059, "learning_rate": 2.5945626477541372e-05, "loss": 0.3712, "step": 1003 }, { "epoch": 1.6012759170653907, "grad_norm": 0.23858557957775348, "learning_rate": 2.5916075650118203e-05, "loss": 0.3557, "step": 1004 }, { "epoch": 1.6028708133971292, "grad_norm": 0.22848137436810237, "learning_rate": 2.5886524822695034e-05, "loss": 0.36, "step": 1005 }, { "epoch": 1.6044657097288675, "grad_norm": 0.2133895815994295, "learning_rate": 2.585697399527187e-05, "loss": 0.3704, "step": 1006 }, { "epoch": 1.606060606060606, "grad_norm": 0.21372595983846324, "learning_rate": 2.5827423167848702e-05, "loss": 0.3401, "step": 1007 }, { "epoch": 1.6076555023923444, "grad_norm": 0.2276115343771436, "learning_rate": 2.5797872340425532e-05, "loss": 0.3757, "step": 1008 }, { "epoch": 1.609250398724083, "grad_norm": 0.22043136106178238, "learning_rate": 2.5768321513002363e-05, "loss": 0.3642, "step": 1009 }, { "epoch": 1.6108452950558214, "grad_norm": 0.20893966942405567, "learning_rate": 2.57387706855792e-05, "loss": 0.3662, "step": 1010 }, { "epoch": 1.61244019138756, "grad_norm": 0.2142181243989403, "learning_rate": 2.570921985815603e-05, "loss": 0.3599, "step": 1011 }, { "epoch": 1.6140350877192984, "grad_norm": 0.2144420710855822, "learning_rate": 2.5679669030732862e-05, "loss": 0.3731, "step": 1012 }, { "epoch": 1.6156299840510366, "grad_norm": 0.24342619833595588, "learning_rate": 2.5650118203309693e-05, "loss": 0.3867, "step": 1013 }, { "epoch": 1.6172248803827751, "grad_norm": 0.19845755457082936, "learning_rate": 2.562056737588653e-05, "loss": 0.375, "step": 1014 }, { "epoch": 1.6188197767145136, "grad_norm": 0.2224431763601478, "learning_rate": 2.559101654846336e-05, "loss": 0.3832, "step": 1015 }, { "epoch": 1.6204146730462519, "grad_norm": 0.19980307408299733, "learning_rate": 2.556146572104019e-05, "loss": 0.3689, "step": 1016 }, { "epoch": 1.6220095693779903, "grad_norm": 0.21916525852490967, "learning_rate": 2.5531914893617022e-05, "loss": 0.3461, "step": 1017 }, { "epoch": 1.6236044657097288, "grad_norm": 0.19434621621765755, "learning_rate": 2.5502364066193856e-05, "loss": 0.3796, "step": 1018 }, { "epoch": 1.6251993620414673, "grad_norm": 0.2169065735969911, "learning_rate": 2.5472813238770687e-05, "loss": 0.3655, "step": 1019 }, { "epoch": 1.6267942583732058, "grad_norm": 0.20460794511029398, "learning_rate": 2.5443262411347517e-05, "loss": 0.3662, "step": 1020 }, { "epoch": 1.6283891547049443, "grad_norm": 0.21488181172920356, "learning_rate": 2.5413711583924348e-05, "loss": 0.3721, "step": 1021 }, { "epoch": 1.6299840510366828, "grad_norm": 0.21247298194689926, "learning_rate": 2.5384160756501186e-05, "loss": 0.3751, "step": 1022 }, { "epoch": 1.631578947368421, "grad_norm": 0.23567827842618905, "learning_rate": 2.5354609929078016e-05, "loss": 0.3703, "step": 1023 }, { "epoch": 1.6331738437001595, "grad_norm": 0.23211287435179437, "learning_rate": 2.5325059101654847e-05, "loss": 0.3728, "step": 1024 }, { "epoch": 1.6347687400318978, "grad_norm": 0.2512357038147536, "learning_rate": 2.5295508274231678e-05, "loss": 0.3405, "step": 1025 }, { "epoch": 1.6363636363636362, "grad_norm": 0.21047536463129124, "learning_rate": 2.5265957446808515e-05, "loss": 0.3667, "step": 1026 }, { "epoch": 1.6379585326953747, "grad_norm": 0.2565156656199752, "learning_rate": 2.5236406619385346e-05, "loss": 0.393, "step": 1027 }, { "epoch": 1.6395534290271132, "grad_norm": 0.2394194530994183, "learning_rate": 2.5206855791962176e-05, "loss": 0.3779, "step": 1028 }, { "epoch": 1.6411483253588517, "grad_norm": 0.24972756595602325, "learning_rate": 2.5177304964539007e-05, "loss": 0.3586, "step": 1029 }, { "epoch": 1.6427432216905902, "grad_norm": 0.22397652576148236, "learning_rate": 2.514775413711584e-05, "loss": 0.3592, "step": 1030 }, { "epoch": 1.6443381180223287, "grad_norm": 0.23881357754175625, "learning_rate": 2.5118203309692672e-05, "loss": 0.3532, "step": 1031 }, { "epoch": 1.6459330143540671, "grad_norm": 0.23545236402140263, "learning_rate": 2.5088652482269502e-05, "loss": 0.3538, "step": 1032 }, { "epoch": 1.6475279106858054, "grad_norm": 0.2207318873450374, "learning_rate": 2.5059101654846333e-05, "loss": 0.3498, "step": 1033 }, { "epoch": 1.6491228070175439, "grad_norm": 0.21577479022294224, "learning_rate": 2.502955082742317e-05, "loss": 0.3814, "step": 1034 }, { "epoch": 1.6507177033492821, "grad_norm": 0.21922295825192414, "learning_rate": 2.5e-05, "loss": 0.3581, "step": 1035 }, { "epoch": 1.6523125996810206, "grad_norm": 0.24525163550151927, "learning_rate": 2.4970449172576835e-05, "loss": 0.3642, "step": 1036 }, { "epoch": 1.653907496012759, "grad_norm": 0.20534065293112683, "learning_rate": 2.4940898345153666e-05, "loss": 0.3666, "step": 1037 }, { "epoch": 1.6555023923444976, "grad_norm": 0.2274988306561708, "learning_rate": 2.49113475177305e-05, "loss": 0.3519, "step": 1038 }, { "epoch": 1.657097288676236, "grad_norm": 0.21640902119956973, "learning_rate": 2.488179669030733e-05, "loss": 0.3698, "step": 1039 }, { "epoch": 1.6586921850079746, "grad_norm": 0.1965494136042385, "learning_rate": 2.4852245862884165e-05, "loss": 0.3723, "step": 1040 }, { "epoch": 1.660287081339713, "grad_norm": 0.20925669996472832, "learning_rate": 2.4822695035460995e-05, "loss": 0.3651, "step": 1041 }, { "epoch": 1.6618819776714515, "grad_norm": 0.23061850494772626, "learning_rate": 2.4793144208037826e-05, "loss": 0.3684, "step": 1042 }, { "epoch": 1.6634768740031898, "grad_norm": 0.2083183150064671, "learning_rate": 2.4763593380614657e-05, "loss": 0.3735, "step": 1043 }, { "epoch": 1.6650717703349283, "grad_norm": 0.23200754690065872, "learning_rate": 2.473404255319149e-05, "loss": 0.3592, "step": 1044 }, { "epoch": 1.6666666666666665, "grad_norm": 0.20403112880510288, "learning_rate": 2.470449172576832e-05, "loss": 0.3604, "step": 1045 }, { "epoch": 1.668261562998405, "grad_norm": 0.2208296674773137, "learning_rate": 2.4674940898345156e-05, "loss": 0.3698, "step": 1046 }, { "epoch": 1.6698564593301435, "grad_norm": 0.219841628075051, "learning_rate": 2.4645390070921986e-05, "loss": 0.3564, "step": 1047 }, { "epoch": 1.671451355661882, "grad_norm": 0.20979788372181146, "learning_rate": 2.461583924349882e-05, "loss": 0.3609, "step": 1048 }, { "epoch": 1.6730462519936204, "grad_norm": 0.2343946008576476, "learning_rate": 2.458628841607565e-05, "loss": 0.3811, "step": 1049 }, { "epoch": 1.674641148325359, "grad_norm": 0.21052733755468306, "learning_rate": 2.4556737588652485e-05, "loss": 0.3523, "step": 1050 }, { "epoch": 1.6762360446570974, "grad_norm": 0.21969032259373597, "learning_rate": 2.4527186761229316e-05, "loss": 0.3739, "step": 1051 }, { "epoch": 1.6778309409888357, "grad_norm": 0.22805376426616225, "learning_rate": 2.449763593380615e-05, "loss": 0.3594, "step": 1052 }, { "epoch": 1.6794258373205742, "grad_norm": 0.22768713364628984, "learning_rate": 2.446808510638298e-05, "loss": 0.3558, "step": 1053 }, { "epoch": 1.6810207336523126, "grad_norm": 0.22151630757975682, "learning_rate": 2.443853427895981e-05, "loss": 0.3636, "step": 1054 }, { "epoch": 1.682615629984051, "grad_norm": 0.2215752835797092, "learning_rate": 2.4408983451536642e-05, "loss": 0.3596, "step": 1055 }, { "epoch": 1.6842105263157894, "grad_norm": 0.24645698987853698, "learning_rate": 2.4379432624113476e-05, "loss": 0.3911, "step": 1056 }, { "epoch": 1.6858054226475279, "grad_norm": 0.25430622452358026, "learning_rate": 2.4349881796690306e-05, "loss": 0.3717, "step": 1057 }, { "epoch": 1.6874003189792663, "grad_norm": 0.20625616783026457, "learning_rate": 2.432033096926714e-05, "loss": 0.364, "step": 1058 }, { "epoch": 1.6889952153110048, "grad_norm": 0.21474601525652337, "learning_rate": 2.429078014184397e-05, "loss": 0.3592, "step": 1059 }, { "epoch": 1.6905901116427433, "grad_norm": 0.2501378072588939, "learning_rate": 2.4261229314420805e-05, "loss": 0.3751, "step": 1060 }, { "epoch": 1.6921850079744818, "grad_norm": 0.20542912753986298, "learning_rate": 2.4231678486997636e-05, "loss": 0.3602, "step": 1061 }, { "epoch": 1.69377990430622, "grad_norm": 0.23833197411140378, "learning_rate": 2.420212765957447e-05, "loss": 0.3594, "step": 1062 }, { "epoch": 1.6953748006379585, "grad_norm": 0.2505111652415351, "learning_rate": 2.41725768321513e-05, "loss": 0.3596, "step": 1063 }, { "epoch": 1.696969696969697, "grad_norm": 0.20390302079477488, "learning_rate": 2.4143026004728135e-05, "loss": 0.351, "step": 1064 }, { "epoch": 1.6985645933014353, "grad_norm": 0.2306794877885382, "learning_rate": 2.4113475177304965e-05, "loss": 0.3537, "step": 1065 }, { "epoch": 1.7001594896331738, "grad_norm": 0.24151203944066563, "learning_rate": 2.40839243498818e-05, "loss": 0.3555, "step": 1066 }, { "epoch": 1.7017543859649122, "grad_norm": 0.2278190452569431, "learning_rate": 2.405437352245863e-05, "loss": 0.3649, "step": 1067 }, { "epoch": 1.7033492822966507, "grad_norm": 0.2143752734750697, "learning_rate": 2.402482269503546e-05, "loss": 0.3687, "step": 1068 }, { "epoch": 1.7049441786283892, "grad_norm": 0.23324667058516482, "learning_rate": 2.3995271867612295e-05, "loss": 0.3629, "step": 1069 }, { "epoch": 1.7065390749601277, "grad_norm": 0.2151438181052852, "learning_rate": 2.3965721040189126e-05, "loss": 0.3458, "step": 1070 }, { "epoch": 1.7081339712918662, "grad_norm": 0.20508138099525114, "learning_rate": 2.393617021276596e-05, "loss": 0.3558, "step": 1071 }, { "epoch": 1.7097288676236044, "grad_norm": 0.1966750079788111, "learning_rate": 2.390661938534279e-05, "loss": 0.3473, "step": 1072 }, { "epoch": 1.711323763955343, "grad_norm": 0.23661313063708325, "learning_rate": 2.3877068557919624e-05, "loss": 0.4028, "step": 1073 }, { "epoch": 1.7129186602870812, "grad_norm": 0.18748225912217392, "learning_rate": 2.3847517730496455e-05, "loss": 0.3614, "step": 1074 }, { "epoch": 1.7145135566188197, "grad_norm": 0.2139032070188827, "learning_rate": 2.381796690307329e-05, "loss": 0.3464, "step": 1075 }, { "epoch": 1.7161084529505581, "grad_norm": 0.21664775906205216, "learning_rate": 2.378841607565012e-05, "loss": 0.3761, "step": 1076 }, { "epoch": 1.7177033492822966, "grad_norm": 0.21241376109241308, "learning_rate": 2.3758865248226954e-05, "loss": 0.3746, "step": 1077 }, { "epoch": 1.719298245614035, "grad_norm": 0.2099880297242477, "learning_rate": 2.3729314420803784e-05, "loss": 0.3857, "step": 1078 }, { "epoch": 1.7208931419457736, "grad_norm": 0.19372599218115852, "learning_rate": 2.3699763593380615e-05, "loss": 0.3618, "step": 1079 }, { "epoch": 1.722488038277512, "grad_norm": 0.21047068540832495, "learning_rate": 2.3670212765957446e-05, "loss": 0.3776, "step": 1080 }, { "epoch": 1.7240829346092506, "grad_norm": 0.21639802938656819, "learning_rate": 2.364066193853428e-05, "loss": 0.3751, "step": 1081 }, { "epoch": 1.7256778309409888, "grad_norm": 0.2334113949689573, "learning_rate": 2.361111111111111e-05, "loss": 0.3729, "step": 1082 }, { "epoch": 1.7272727272727273, "grad_norm": 0.20730081271186543, "learning_rate": 2.3581560283687945e-05, "loss": 0.3562, "step": 1083 }, { "epoch": 1.7288676236044656, "grad_norm": 0.19244571786195055, "learning_rate": 2.3552009456264775e-05, "loss": 0.3477, "step": 1084 }, { "epoch": 1.730462519936204, "grad_norm": 0.18927238204330252, "learning_rate": 2.352245862884161e-05, "loss": 0.358, "step": 1085 }, { "epoch": 1.7320574162679425, "grad_norm": 0.18029016867721917, "learning_rate": 2.349290780141844e-05, "loss": 0.3595, "step": 1086 }, { "epoch": 1.733652312599681, "grad_norm": 0.19585772780031327, "learning_rate": 2.3463356973995274e-05, "loss": 0.3476, "step": 1087 }, { "epoch": 1.7352472089314195, "grad_norm": 0.2019920332438816, "learning_rate": 2.3433806146572105e-05, "loss": 0.3583, "step": 1088 }, { "epoch": 1.736842105263158, "grad_norm": 0.19479728329542398, "learning_rate": 2.340425531914894e-05, "loss": 0.3524, "step": 1089 }, { "epoch": 1.7384370015948964, "grad_norm": 0.2021773618341524, "learning_rate": 2.337470449172577e-05, "loss": 0.3654, "step": 1090 }, { "epoch": 1.740031897926635, "grad_norm": 0.19614077962354198, "learning_rate": 2.3345153664302604e-05, "loss": 0.3586, "step": 1091 }, { "epoch": 1.7416267942583732, "grad_norm": 0.2570960840155657, "learning_rate": 2.3315602836879434e-05, "loss": 0.3636, "step": 1092 }, { "epoch": 1.7432216905901117, "grad_norm": 0.20281357247489218, "learning_rate": 2.3286052009456265e-05, "loss": 0.3651, "step": 1093 }, { "epoch": 1.74481658692185, "grad_norm": 0.22620348480383348, "learning_rate": 2.3256501182033096e-05, "loss": 0.3684, "step": 1094 }, { "epoch": 1.7464114832535884, "grad_norm": 0.20579921924161573, "learning_rate": 2.322695035460993e-05, "loss": 0.3654, "step": 1095 }, { "epoch": 1.748006379585327, "grad_norm": 0.2227189192635118, "learning_rate": 2.319739952718676e-05, "loss": 0.3644, "step": 1096 }, { "epoch": 1.7496012759170654, "grad_norm": 0.19227928591569257, "learning_rate": 2.3167848699763594e-05, "loss": 0.3505, "step": 1097 }, { "epoch": 1.7511961722488039, "grad_norm": 0.20961373720141127, "learning_rate": 2.3138297872340425e-05, "loss": 0.3282, "step": 1098 }, { "epoch": 1.7527910685805423, "grad_norm": 0.21063768153732731, "learning_rate": 2.310874704491726e-05, "loss": 0.3667, "step": 1099 }, { "epoch": 1.7543859649122808, "grad_norm": 0.2071998836644897, "learning_rate": 2.307919621749409e-05, "loss": 0.3616, "step": 1100 }, { "epoch": 1.755980861244019, "grad_norm": 0.1995518189157131, "learning_rate": 2.3049645390070924e-05, "loss": 0.3545, "step": 1101 }, { "epoch": 1.7575757575757576, "grad_norm": 0.19335250730242237, "learning_rate": 2.3020094562647758e-05, "loss": 0.3729, "step": 1102 }, { "epoch": 1.759170653907496, "grad_norm": 0.21182915642766367, "learning_rate": 2.299054373522459e-05, "loss": 0.3808, "step": 1103 }, { "epoch": 1.7607655502392343, "grad_norm": 0.20262308803415194, "learning_rate": 2.296099290780142e-05, "loss": 0.3635, "step": 1104 }, { "epoch": 1.7623604465709728, "grad_norm": 0.22655090994786029, "learning_rate": 2.293144208037825e-05, "loss": 0.3715, "step": 1105 }, { "epoch": 1.7639553429027113, "grad_norm": 0.2124372242582412, "learning_rate": 2.2901891252955084e-05, "loss": 0.3585, "step": 1106 }, { "epoch": 1.7655502392344498, "grad_norm": 0.20889508760129594, "learning_rate": 2.2872340425531915e-05, "loss": 0.364, "step": 1107 }, { "epoch": 1.7671451355661882, "grad_norm": 0.2301142215030826, "learning_rate": 2.284278959810875e-05, "loss": 0.3484, "step": 1108 }, { "epoch": 1.7687400318979267, "grad_norm": 0.1921558272674491, "learning_rate": 2.281323877068558e-05, "loss": 0.3459, "step": 1109 }, { "epoch": 1.7703349282296652, "grad_norm": 0.2314125615321567, "learning_rate": 2.2783687943262413e-05, "loss": 0.3839, "step": 1110 }, { "epoch": 1.7719298245614035, "grad_norm": 0.21817280169325118, "learning_rate": 2.2754137115839244e-05, "loss": 0.3438, "step": 1111 }, { "epoch": 1.773524720893142, "grad_norm": 0.19479616209273914, "learning_rate": 2.2724586288416078e-05, "loss": 0.3723, "step": 1112 }, { "epoch": 1.7751196172248804, "grad_norm": 0.21977887695249768, "learning_rate": 2.269503546099291e-05, "loss": 0.3543, "step": 1113 }, { "epoch": 1.7767145135566187, "grad_norm": 0.20832344817675735, "learning_rate": 2.2665484633569743e-05, "loss": 0.3407, "step": 1114 }, { "epoch": 1.7783094098883572, "grad_norm": 0.20239275299325613, "learning_rate": 2.2635933806146574e-05, "loss": 0.3673, "step": 1115 }, { "epoch": 1.7799043062200957, "grad_norm": 0.23629726124809497, "learning_rate": 2.2606382978723408e-05, "loss": 0.3879, "step": 1116 }, { "epoch": 1.7814992025518341, "grad_norm": 0.21761161093235726, "learning_rate": 2.2576832151300238e-05, "loss": 0.3616, "step": 1117 }, { "epoch": 1.7830940988835726, "grad_norm": 0.2185267666669683, "learning_rate": 2.254728132387707e-05, "loss": 0.3667, "step": 1118 }, { "epoch": 1.784688995215311, "grad_norm": 0.209611108188423, "learning_rate": 2.25177304964539e-05, "loss": 0.3462, "step": 1119 }, { "epoch": 1.7862838915470496, "grad_norm": 0.23165734104077176, "learning_rate": 2.2488179669030734e-05, "loss": 0.3776, "step": 1120 }, { "epoch": 1.7878787878787878, "grad_norm": 0.2175213328568231, "learning_rate": 2.2458628841607564e-05, "loss": 0.3709, "step": 1121 }, { "epoch": 1.7894736842105263, "grad_norm": 0.20006728329203025, "learning_rate": 2.24290780141844e-05, "loss": 0.3677, "step": 1122 }, { "epoch": 1.7910685805422646, "grad_norm": 0.22135146953594048, "learning_rate": 2.239952718676123e-05, "loss": 0.3551, "step": 1123 }, { "epoch": 1.792663476874003, "grad_norm": 0.23581085660978657, "learning_rate": 2.2369976359338063e-05, "loss": 0.3582, "step": 1124 }, { "epoch": 1.7942583732057416, "grad_norm": 0.20248215759754526, "learning_rate": 2.2340425531914894e-05, "loss": 0.3541, "step": 1125 }, { "epoch": 1.79585326953748, "grad_norm": 0.2163944935660489, "learning_rate": 2.2310874704491728e-05, "loss": 0.3524, "step": 1126 }, { "epoch": 1.7974481658692185, "grad_norm": 0.24671218905780837, "learning_rate": 2.228132387706856e-05, "loss": 0.3713, "step": 1127 }, { "epoch": 1.799043062200957, "grad_norm": 0.21219691441301222, "learning_rate": 2.2251773049645393e-05, "loss": 0.3713, "step": 1128 }, { "epoch": 1.8006379585326955, "grad_norm": 0.22668961968720344, "learning_rate": 2.2222222222222223e-05, "loss": 0.3566, "step": 1129 }, { "epoch": 1.802232854864434, "grad_norm": 0.24079179080619398, "learning_rate": 2.2192671394799054e-05, "loss": 0.3587, "step": 1130 }, { "epoch": 1.8038277511961722, "grad_norm": 0.20691565297004233, "learning_rate": 2.2163120567375885e-05, "loss": 0.3718, "step": 1131 }, { "epoch": 1.8054226475279107, "grad_norm": 0.23800746009961052, "learning_rate": 2.213356973995272e-05, "loss": 0.3535, "step": 1132 }, { "epoch": 1.807017543859649, "grad_norm": 0.21065000410833193, "learning_rate": 2.2104018912529553e-05, "loss": 0.3625, "step": 1133 }, { "epoch": 1.8086124401913874, "grad_norm": 0.24720976391895755, "learning_rate": 2.2074468085106383e-05, "loss": 0.3874, "step": 1134 }, { "epoch": 1.810207336523126, "grad_norm": 0.2205955259157615, "learning_rate": 2.2044917257683217e-05, "loss": 0.3711, "step": 1135 }, { "epoch": 1.8118022328548644, "grad_norm": 0.2336639506148841, "learning_rate": 2.2015366430260048e-05, "loss": 0.3834, "step": 1136 }, { "epoch": 1.813397129186603, "grad_norm": 0.23494091617994817, "learning_rate": 2.1985815602836882e-05, "loss": 0.3632, "step": 1137 }, { "epoch": 1.8149920255183414, "grad_norm": 0.20944342747066225, "learning_rate": 2.1956264775413713e-05, "loss": 0.3728, "step": 1138 }, { "epoch": 1.8165869218500799, "grad_norm": 0.21109628210895187, "learning_rate": 2.1926713947990547e-05, "loss": 0.3793, "step": 1139 }, { "epoch": 1.8181818181818183, "grad_norm": 0.21760988869624837, "learning_rate": 2.1897163120567378e-05, "loss": 0.3643, "step": 1140 }, { "epoch": 1.8197767145135566, "grad_norm": 0.2232367417816179, "learning_rate": 2.186761229314421e-05, "loss": 0.36, "step": 1141 }, { "epoch": 1.821371610845295, "grad_norm": 0.23123506521690906, "learning_rate": 2.1838061465721042e-05, "loss": 0.3654, "step": 1142 }, { "epoch": 1.8229665071770333, "grad_norm": 0.23474553435480439, "learning_rate": 2.1808510638297873e-05, "loss": 0.3698, "step": 1143 }, { "epoch": 1.8245614035087718, "grad_norm": 0.21347420630529704, "learning_rate": 2.1778959810874704e-05, "loss": 0.3596, "step": 1144 }, { "epoch": 1.8261562998405103, "grad_norm": 0.2397938992569051, "learning_rate": 2.1749408983451538e-05, "loss": 0.3771, "step": 1145 }, { "epoch": 1.8277511961722488, "grad_norm": 0.24602180084444822, "learning_rate": 2.171985815602837e-05, "loss": 0.3789, "step": 1146 }, { "epoch": 1.8293460925039873, "grad_norm": 0.2189084180402523, "learning_rate": 2.1690307328605202e-05, "loss": 0.3573, "step": 1147 }, { "epoch": 1.8309409888357258, "grad_norm": 0.23173712144307926, "learning_rate": 2.1660756501182033e-05, "loss": 0.3598, "step": 1148 }, { "epoch": 1.8325358851674642, "grad_norm": 0.2075753895912378, "learning_rate": 2.1631205673758867e-05, "loss": 0.3606, "step": 1149 }, { "epoch": 1.8341307814992025, "grad_norm": 0.21275081211739005, "learning_rate": 2.1601654846335698e-05, "loss": 0.3662, "step": 1150 }, { "epoch": 1.835725677830941, "grad_norm": 0.23725200350761486, "learning_rate": 2.1572104018912532e-05, "loss": 0.3715, "step": 1151 }, { "epoch": 1.8373205741626795, "grad_norm": 0.22366307590524695, "learning_rate": 2.1542553191489363e-05, "loss": 0.3842, "step": 1152 }, { "epoch": 1.8389154704944177, "grad_norm": 0.2156313771480598, "learning_rate": 2.1513002364066197e-05, "loss": 0.3686, "step": 1153 }, { "epoch": 1.8405103668261562, "grad_norm": 0.2516422268125818, "learning_rate": 2.1483451536643027e-05, "loss": 0.343, "step": 1154 }, { "epoch": 1.8421052631578947, "grad_norm": 0.22593058607522873, "learning_rate": 2.1453900709219858e-05, "loss": 0.3684, "step": 1155 }, { "epoch": 1.8437001594896332, "grad_norm": 0.1957139414101018, "learning_rate": 2.142434988179669e-05, "loss": 0.3591, "step": 1156 }, { "epoch": 1.8452950558213717, "grad_norm": 0.23303267708690414, "learning_rate": 2.1394799054373523e-05, "loss": 0.3652, "step": 1157 }, { "epoch": 1.8468899521531101, "grad_norm": 0.22074977397560983, "learning_rate": 2.1365248226950353e-05, "loss": 0.3616, "step": 1158 }, { "epoch": 1.8484848484848486, "grad_norm": 0.22447073032105577, "learning_rate": 2.1335697399527187e-05, "loss": 0.3727, "step": 1159 }, { "epoch": 1.8500797448165869, "grad_norm": 0.21900291961568127, "learning_rate": 2.1306146572104018e-05, "loss": 0.3653, "step": 1160 }, { "epoch": 1.8516746411483254, "grad_norm": 0.20590548214752227, "learning_rate": 2.1276595744680852e-05, "loss": 0.369, "step": 1161 }, { "epoch": 1.8532695374800638, "grad_norm": 0.22733428976573516, "learning_rate": 2.1247044917257683e-05, "loss": 0.3757, "step": 1162 }, { "epoch": 1.854864433811802, "grad_norm": 0.22043562368500438, "learning_rate": 2.1217494089834517e-05, "loss": 0.3611, "step": 1163 }, { "epoch": 1.8564593301435406, "grad_norm": 0.19457023394314785, "learning_rate": 2.1187943262411348e-05, "loss": 0.3581, "step": 1164 }, { "epoch": 1.858054226475279, "grad_norm": 0.21239827349602938, "learning_rate": 2.115839243498818e-05, "loss": 0.3705, "step": 1165 }, { "epoch": 1.8596491228070176, "grad_norm": 0.20539034669775078, "learning_rate": 2.1128841607565016e-05, "loss": 0.3767, "step": 1166 }, { "epoch": 1.861244019138756, "grad_norm": 0.2213992238290749, "learning_rate": 2.1099290780141846e-05, "loss": 0.3615, "step": 1167 }, { "epoch": 1.8628389154704945, "grad_norm": 0.2514908496680851, "learning_rate": 2.1069739952718677e-05, "loss": 0.369, "step": 1168 }, { "epoch": 1.864433811802233, "grad_norm": 0.20434719078127045, "learning_rate": 2.1040189125295508e-05, "loss": 0.3463, "step": 1169 }, { "epoch": 1.8660287081339713, "grad_norm": 0.21887669266262366, "learning_rate": 2.1010638297872342e-05, "loss": 0.3597, "step": 1170 }, { "epoch": 1.8676236044657097, "grad_norm": 0.19587414983918255, "learning_rate": 2.0981087470449173e-05, "loss": 0.362, "step": 1171 }, { "epoch": 1.869218500797448, "grad_norm": 0.19519121532045125, "learning_rate": 2.0951536643026007e-05, "loss": 0.3646, "step": 1172 }, { "epoch": 1.8708133971291865, "grad_norm": 0.22848651964788727, "learning_rate": 2.0921985815602837e-05, "loss": 0.368, "step": 1173 }, { "epoch": 1.872408293460925, "grad_norm": 0.20813105276236818, "learning_rate": 2.089243498817967e-05, "loss": 0.3672, "step": 1174 }, { "epoch": 1.8740031897926634, "grad_norm": 0.2220053958774283, "learning_rate": 2.0862884160756502e-05, "loss": 0.375, "step": 1175 }, { "epoch": 1.875598086124402, "grad_norm": 0.23500437587765838, "learning_rate": 2.0833333333333336e-05, "loss": 0.3871, "step": 1176 }, { "epoch": 1.8771929824561404, "grad_norm": 0.20464312916845062, "learning_rate": 2.0803782505910167e-05, "loss": 0.3745, "step": 1177 }, { "epoch": 1.878787878787879, "grad_norm": 0.1930245247547481, "learning_rate": 2.0774231678487e-05, "loss": 0.351, "step": 1178 }, { "epoch": 1.8803827751196174, "grad_norm": 0.21359361979591335, "learning_rate": 2.074468085106383e-05, "loss": 0.3635, "step": 1179 }, { "epoch": 1.8819776714513556, "grad_norm": 0.19056423255673252, "learning_rate": 2.0715130023640662e-05, "loss": 0.3552, "step": 1180 }, { "epoch": 1.8835725677830941, "grad_norm": 0.1965072756820685, "learning_rate": 2.0685579196217493e-05, "loss": 0.3503, "step": 1181 }, { "epoch": 1.8851674641148324, "grad_norm": 0.2003887931007665, "learning_rate": 2.0656028368794327e-05, "loss": 0.3621, "step": 1182 }, { "epoch": 1.8867623604465709, "grad_norm": 0.18475318096263882, "learning_rate": 2.0626477541371158e-05, "loss": 0.3514, "step": 1183 }, { "epoch": 1.8883572567783093, "grad_norm": 0.21468678529141882, "learning_rate": 2.059692671394799e-05, "loss": 0.4033, "step": 1184 }, { "epoch": 1.8899521531100478, "grad_norm": 0.2362501836205584, "learning_rate": 2.0567375886524822e-05, "loss": 0.3977, "step": 1185 }, { "epoch": 1.8915470494417863, "grad_norm": 0.2111246964667102, "learning_rate": 2.0537825059101656e-05, "loss": 0.3488, "step": 1186 }, { "epoch": 1.8931419457735248, "grad_norm": 0.22321004206044404, "learning_rate": 2.0508274231678487e-05, "loss": 0.3607, "step": 1187 }, { "epoch": 1.8947368421052633, "grad_norm": 0.22586424263363938, "learning_rate": 2.047872340425532e-05, "loss": 0.37, "step": 1188 }, { "epoch": 1.8963317384370018, "grad_norm": 0.19337381439639317, "learning_rate": 2.0449172576832152e-05, "loss": 0.3585, "step": 1189 }, { "epoch": 1.89792663476874, "grad_norm": 0.2071060897168956, "learning_rate": 2.0419621749408986e-05, "loss": 0.3487, "step": 1190 }, { "epoch": 1.8995215311004785, "grad_norm": 0.19395555269221817, "learning_rate": 2.0390070921985816e-05, "loss": 0.3627, "step": 1191 }, { "epoch": 1.9011164274322168, "grad_norm": 0.19776819862520365, "learning_rate": 2.036052009456265e-05, "loss": 0.364, "step": 1192 }, { "epoch": 1.9027113237639552, "grad_norm": 0.2099926261343192, "learning_rate": 2.033096926713948e-05, "loss": 0.3478, "step": 1193 }, { "epoch": 1.9043062200956937, "grad_norm": 0.21906024883921313, "learning_rate": 2.0301418439716312e-05, "loss": 0.3629, "step": 1194 }, { "epoch": 1.9059011164274322, "grad_norm": 0.20170031710652614, "learning_rate": 2.0271867612293143e-05, "loss": 0.3591, "step": 1195 }, { "epoch": 1.9074960127591707, "grad_norm": 0.22831394932808388, "learning_rate": 2.0242316784869977e-05, "loss": 0.3552, "step": 1196 }, { "epoch": 1.9090909090909092, "grad_norm": 0.2103977663261078, "learning_rate": 2.0212765957446807e-05, "loss": 0.3584, "step": 1197 }, { "epoch": 1.9106858054226477, "grad_norm": 0.20840327681714838, "learning_rate": 2.018321513002364e-05, "loss": 0.3635, "step": 1198 }, { "epoch": 1.912280701754386, "grad_norm": 0.21780905635669093, "learning_rate": 2.0153664302600475e-05, "loss": 0.3508, "step": 1199 }, { "epoch": 1.9138755980861244, "grad_norm": 0.2276857206643273, "learning_rate": 2.0124113475177306e-05, "loss": 0.3696, "step": 1200 }, { "epoch": 1.9154704944178629, "grad_norm": 0.23026483314645596, "learning_rate": 2.009456264775414e-05, "loss": 0.3712, "step": 1201 }, { "epoch": 1.9170653907496011, "grad_norm": 0.21535530477893977, "learning_rate": 2.006501182033097e-05, "loss": 0.3669, "step": 1202 }, { "epoch": 1.9186602870813396, "grad_norm": 0.2006097026405861, "learning_rate": 2.0035460992907805e-05, "loss": 0.3697, "step": 1203 }, { "epoch": 1.920255183413078, "grad_norm": 0.21306675145409854, "learning_rate": 2.0005910165484636e-05, "loss": 0.3695, "step": 1204 }, { "epoch": 1.9218500797448166, "grad_norm": 0.2276407703099162, "learning_rate": 1.9976359338061466e-05, "loss": 0.3527, "step": 1205 }, { "epoch": 1.923444976076555, "grad_norm": 0.22280588096299894, "learning_rate": 1.9946808510638297e-05, "loss": 0.3501, "step": 1206 }, { "epoch": 1.9250398724082936, "grad_norm": 0.21056313742586114, "learning_rate": 1.991725768321513e-05, "loss": 0.3561, "step": 1207 }, { "epoch": 1.926634768740032, "grad_norm": 0.19698963778401163, "learning_rate": 1.988770685579196e-05, "loss": 0.343, "step": 1208 }, { "epoch": 1.9282296650717703, "grad_norm": 0.21023773806683652, "learning_rate": 1.9858156028368796e-05, "loss": 0.3914, "step": 1209 }, { "epoch": 1.9298245614035088, "grad_norm": 0.2504757159085716, "learning_rate": 1.9828605200945626e-05, "loss": 0.3833, "step": 1210 }, { "epoch": 1.9314194577352473, "grad_norm": 0.22778760536565357, "learning_rate": 1.979905437352246e-05, "loss": 0.3701, "step": 1211 }, { "epoch": 1.9330143540669855, "grad_norm": 0.20381209256787614, "learning_rate": 1.976950354609929e-05, "loss": 0.3649, "step": 1212 }, { "epoch": 1.934609250398724, "grad_norm": 0.22354421684057005, "learning_rate": 1.9739952718676125e-05, "loss": 0.3654, "step": 1213 }, { "epoch": 1.9362041467304625, "grad_norm": 0.24036459835145102, "learning_rate": 1.9710401891252956e-05, "loss": 0.3615, "step": 1214 }, { "epoch": 1.937799043062201, "grad_norm": 0.20681904945094767, "learning_rate": 1.968085106382979e-05, "loss": 0.3761, "step": 1215 }, { "epoch": 1.9393939393939394, "grad_norm": 0.21156380524768, "learning_rate": 1.965130023640662e-05, "loss": 0.3548, "step": 1216 }, { "epoch": 1.940988835725678, "grad_norm": 0.23899045788935586, "learning_rate": 1.9621749408983455e-05, "loss": 0.3733, "step": 1217 }, { "epoch": 1.9425837320574164, "grad_norm": 0.22838552654221903, "learning_rate": 1.9592198581560285e-05, "loss": 0.3421, "step": 1218 }, { "epoch": 1.9441786283891547, "grad_norm": 0.21222057330678146, "learning_rate": 1.9562647754137116e-05, "loss": 0.3549, "step": 1219 }, { "epoch": 1.9457735247208932, "grad_norm": 0.18862978542646014, "learning_rate": 1.9533096926713947e-05, "loss": 0.3589, "step": 1220 }, { "epoch": 1.9473684210526314, "grad_norm": 0.2240986456779227, "learning_rate": 1.950354609929078e-05, "loss": 0.3564, "step": 1221 }, { "epoch": 1.94896331738437, "grad_norm": 0.21925476527208954, "learning_rate": 1.947399527186761e-05, "loss": 0.365, "step": 1222 }, { "epoch": 1.9505582137161084, "grad_norm": 0.2072172757543059, "learning_rate": 1.9444444444444445e-05, "loss": 0.35, "step": 1223 }, { "epoch": 1.9521531100478469, "grad_norm": 0.2094580748421578, "learning_rate": 1.9414893617021276e-05, "loss": 0.3711, "step": 1224 }, { "epoch": 1.9537480063795853, "grad_norm": 0.20294199811985142, "learning_rate": 1.938534278959811e-05, "loss": 0.371, "step": 1225 }, { "epoch": 1.9553429027113238, "grad_norm": 0.2333803868144073, "learning_rate": 1.935579196217494e-05, "loss": 0.3712, "step": 1226 }, { "epoch": 1.9569377990430623, "grad_norm": 0.2184787379898616, "learning_rate": 1.9326241134751775e-05, "loss": 0.3419, "step": 1227 }, { "epoch": 1.9585326953748008, "grad_norm": 0.22537412531652495, "learning_rate": 1.9296690307328606e-05, "loss": 0.3456, "step": 1228 }, { "epoch": 1.960127591706539, "grad_norm": 0.19527539159522414, "learning_rate": 1.926713947990544e-05, "loss": 0.3577, "step": 1229 }, { "epoch": 1.9617224880382775, "grad_norm": 0.20612837221294056, "learning_rate": 1.923758865248227e-05, "loss": 0.3474, "step": 1230 }, { "epoch": 1.9633173843700158, "grad_norm": 0.20980857889202217, "learning_rate": 1.92080378250591e-05, "loss": 0.3674, "step": 1231 }, { "epoch": 1.9649122807017543, "grad_norm": 0.21129626785433953, "learning_rate": 1.9178486997635935e-05, "loss": 0.366, "step": 1232 }, { "epoch": 1.9665071770334928, "grad_norm": 0.21287675183652235, "learning_rate": 1.9148936170212766e-05, "loss": 0.3716, "step": 1233 }, { "epoch": 1.9681020733652312, "grad_norm": 0.19978242463537046, "learning_rate": 1.91193853427896e-05, "loss": 0.3583, "step": 1234 }, { "epoch": 1.9696969696969697, "grad_norm": 0.19509075697607434, "learning_rate": 1.908983451536643e-05, "loss": 0.3571, "step": 1235 }, { "epoch": 1.9712918660287082, "grad_norm": 0.20339374487096457, "learning_rate": 1.9060283687943264e-05, "loss": 0.3631, "step": 1236 }, { "epoch": 1.9728867623604467, "grad_norm": 0.20692252071892808, "learning_rate": 1.9030732860520095e-05, "loss": 0.3533, "step": 1237 }, { "epoch": 1.9744816586921852, "grad_norm": 0.2020126359871856, "learning_rate": 1.900118203309693e-05, "loss": 0.3677, "step": 1238 }, { "epoch": 1.9760765550239234, "grad_norm": 0.18561398854058006, "learning_rate": 1.897163120567376e-05, "loss": 0.3537, "step": 1239 }, { "epoch": 1.977671451355662, "grad_norm": 0.1906270355769171, "learning_rate": 1.8942080378250594e-05, "loss": 0.3458, "step": 1240 }, { "epoch": 1.9792663476874002, "grad_norm": 0.21054556989959616, "learning_rate": 1.8912529550827425e-05, "loss": 0.3756, "step": 1241 }, { "epoch": 1.9808612440191387, "grad_norm": 0.20168646009148083, "learning_rate": 1.888297872340426e-05, "loss": 0.3555, "step": 1242 }, { "epoch": 1.9824561403508771, "grad_norm": 0.2128734720807032, "learning_rate": 1.885342789598109e-05, "loss": 0.3526, "step": 1243 }, { "epoch": 1.9840510366826156, "grad_norm": 0.23567057646931902, "learning_rate": 1.882387706855792e-05, "loss": 0.3865, "step": 1244 }, { "epoch": 1.985645933014354, "grad_norm": 0.19258613384430684, "learning_rate": 1.879432624113475e-05, "loss": 0.3625, "step": 1245 }, { "epoch": 1.9872408293460926, "grad_norm": 0.21356290717847992, "learning_rate": 1.8764775413711585e-05, "loss": 0.3677, "step": 1246 }, { "epoch": 1.988835725677831, "grad_norm": 0.23224951279487915, "learning_rate": 1.8735224586288415e-05, "loss": 0.3659, "step": 1247 }, { "epoch": 1.9904306220095693, "grad_norm": 0.19051571003971499, "learning_rate": 1.870567375886525e-05, "loss": 0.3607, "step": 1248 }, { "epoch": 1.9920255183413078, "grad_norm": 0.20515222571722486, "learning_rate": 1.867612293144208e-05, "loss": 0.3785, "step": 1249 }, { "epoch": 1.9936204146730463, "grad_norm": 0.2034537210496292, "learning_rate": 1.8646572104018914e-05, "loss": 0.3611, "step": 1250 }, { "epoch": 1.9952153110047846, "grad_norm": 0.19054342734771174, "learning_rate": 1.8617021276595745e-05, "loss": 0.3657, "step": 1251 }, { "epoch": 1.996810207336523, "grad_norm": 0.19131687528489824, "learning_rate": 1.858747044917258e-05, "loss": 0.3549, "step": 1252 }, { "epoch": 1.9984051036682615, "grad_norm": 0.1845228820641462, "learning_rate": 1.855791962174941e-05, "loss": 0.3697, "step": 1253 }, { "epoch": 2.0, "grad_norm": 0.20822106485055106, "learning_rate": 1.8528368794326244e-05, "loss": 0.3436, "step": 1254 }, { "epoch": 2.0015948963317385, "grad_norm": 0.2849190491479579, "learning_rate": 1.8498817966903074e-05, "loss": 0.2928, "step": 1255 }, { "epoch": 2.003189792663477, "grad_norm": 0.22937673632370936, "learning_rate": 1.8469267139479905e-05, "loss": 0.2921, "step": 1256 }, { "epoch": 2.0047846889952154, "grad_norm": 0.2521055976969963, "learning_rate": 1.8439716312056736e-05, "loss": 0.2793, "step": 1257 }, { "epoch": 2.006379585326954, "grad_norm": 0.34892193343465006, "learning_rate": 1.841016548463357e-05, "loss": 0.2879, "step": 1258 }, { "epoch": 2.007974481658692, "grad_norm": 0.21682664107427993, "learning_rate": 1.83806146572104e-05, "loss": 0.2828, "step": 1259 }, { "epoch": 2.0095693779904304, "grad_norm": 0.26455959997386647, "learning_rate": 1.8351063829787234e-05, "loss": 0.292, "step": 1260 }, { "epoch": 2.011164274322169, "grad_norm": 0.33918711065900525, "learning_rate": 1.8321513002364065e-05, "loss": 0.281, "step": 1261 }, { "epoch": 2.0127591706539074, "grad_norm": 0.24299520668647306, "learning_rate": 1.82919621749409e-05, "loss": 0.2718, "step": 1262 }, { "epoch": 2.014354066985646, "grad_norm": 0.24012673466578743, "learning_rate": 1.8262411347517733e-05, "loss": 0.2702, "step": 1263 }, { "epoch": 2.0159489633173844, "grad_norm": 0.34092699565463513, "learning_rate": 1.8232860520094564e-05, "loss": 0.3071, "step": 1264 }, { "epoch": 2.017543859649123, "grad_norm": 0.23908769569451171, "learning_rate": 1.8203309692671398e-05, "loss": 0.2927, "step": 1265 }, { "epoch": 2.0191387559808613, "grad_norm": 0.20355989320570084, "learning_rate": 1.817375886524823e-05, "loss": 0.2815, "step": 1266 }, { "epoch": 2.0207336523126, "grad_norm": 0.23854033007528547, "learning_rate": 1.8144208037825063e-05, "loss": 0.27, "step": 1267 }, { "epoch": 2.0223285486443383, "grad_norm": 0.2650850111216802, "learning_rate": 1.8114657210401893e-05, "loss": 0.3034, "step": 1268 }, { "epoch": 2.0239234449760763, "grad_norm": 0.19891520218136072, "learning_rate": 1.8085106382978724e-05, "loss": 0.2852, "step": 1269 }, { "epoch": 2.025518341307815, "grad_norm": 0.21389797420191153, "learning_rate": 1.8055555555555555e-05, "loss": 0.2681, "step": 1270 }, { "epoch": 2.0271132376395533, "grad_norm": 0.2332482324251739, "learning_rate": 1.802600472813239e-05, "loss": 0.2776, "step": 1271 }, { "epoch": 2.028708133971292, "grad_norm": 0.2436350048866769, "learning_rate": 1.799645390070922e-05, "loss": 0.2831, "step": 1272 }, { "epoch": 2.0303030303030303, "grad_norm": 0.21714595731617656, "learning_rate": 1.7966903073286054e-05, "loss": 0.2817, "step": 1273 }, { "epoch": 2.0318979266347688, "grad_norm": 0.26128740050213645, "learning_rate": 1.7937352245862884e-05, "loss": 0.2896, "step": 1274 }, { "epoch": 2.0334928229665072, "grad_norm": 0.2148662501552406, "learning_rate": 1.7907801418439718e-05, "loss": 0.285, "step": 1275 }, { "epoch": 2.0350877192982457, "grad_norm": 0.22240662997549906, "learning_rate": 1.787825059101655e-05, "loss": 0.2748, "step": 1276 }, { "epoch": 2.036682615629984, "grad_norm": 0.24193581800562913, "learning_rate": 1.7848699763593383e-05, "loss": 0.2861, "step": 1277 }, { "epoch": 2.0382775119617227, "grad_norm": 0.21976567995278926, "learning_rate": 1.7819148936170214e-05, "loss": 0.2901, "step": 1278 }, { "epoch": 2.0398724082934607, "grad_norm": 0.2147859240511922, "learning_rate": 1.7789598108747048e-05, "loss": 0.283, "step": 1279 }, { "epoch": 2.041467304625199, "grad_norm": 0.21168731630156393, "learning_rate": 1.776004728132388e-05, "loss": 0.2935, "step": 1280 }, { "epoch": 2.0430622009569377, "grad_norm": 0.21746367286508522, "learning_rate": 1.773049645390071e-05, "loss": 0.2842, "step": 1281 }, { "epoch": 2.044657097288676, "grad_norm": 0.20319608547869641, "learning_rate": 1.770094562647754e-05, "loss": 0.2935, "step": 1282 }, { "epoch": 2.0462519936204147, "grad_norm": 0.20486373533703883, "learning_rate": 1.7671394799054374e-05, "loss": 0.2952, "step": 1283 }, { "epoch": 2.047846889952153, "grad_norm": 0.19920809595136751, "learning_rate": 1.7641843971631204e-05, "loss": 0.2661, "step": 1284 }, { "epoch": 2.0494417862838916, "grad_norm": 0.1876223483832252, "learning_rate": 1.761229314420804e-05, "loss": 0.267, "step": 1285 }, { "epoch": 2.05103668261563, "grad_norm": 0.1947931886768685, "learning_rate": 1.758274231678487e-05, "loss": 0.2642, "step": 1286 }, { "epoch": 2.0526315789473686, "grad_norm": 0.19561103032606642, "learning_rate": 1.7553191489361703e-05, "loss": 0.2897, "step": 1287 }, { "epoch": 2.054226475279107, "grad_norm": 0.21125936852986343, "learning_rate": 1.7523640661938534e-05, "loss": 0.2983, "step": 1288 }, { "epoch": 2.055821371610845, "grad_norm": 0.19976290531802215, "learning_rate": 1.7494089834515368e-05, "loss": 0.2985, "step": 1289 }, { "epoch": 2.0574162679425836, "grad_norm": 0.20512877061912477, "learning_rate": 1.74645390070922e-05, "loss": 0.2898, "step": 1290 }, { "epoch": 2.059011164274322, "grad_norm": 0.20462716094105599, "learning_rate": 1.7434988179669033e-05, "loss": 0.2959, "step": 1291 }, { "epoch": 2.0606060606060606, "grad_norm": 0.19120913318236293, "learning_rate": 1.7405437352245863e-05, "loss": 0.27, "step": 1292 }, { "epoch": 2.062200956937799, "grad_norm": 0.19287029093112681, "learning_rate": 1.7375886524822697e-05, "loss": 0.2746, "step": 1293 }, { "epoch": 2.0637958532695375, "grad_norm": 0.2198935062801443, "learning_rate": 1.7346335697399528e-05, "loss": 0.2842, "step": 1294 }, { "epoch": 2.065390749601276, "grad_norm": 0.20863933736599855, "learning_rate": 1.731678486997636e-05, "loss": 0.2865, "step": 1295 }, { "epoch": 2.0669856459330145, "grad_norm": 0.18842779708265273, "learning_rate": 1.7287234042553193e-05, "loss": 0.278, "step": 1296 }, { "epoch": 2.068580542264753, "grad_norm": 0.2273624052200579, "learning_rate": 1.7257683215130024e-05, "loss": 0.2967, "step": 1297 }, { "epoch": 2.0701754385964914, "grad_norm": 0.1884279093732341, "learning_rate": 1.7228132387706858e-05, "loss": 0.289, "step": 1298 }, { "epoch": 2.0717703349282295, "grad_norm": 0.20411973942769812, "learning_rate": 1.7198581560283688e-05, "loss": 0.2902, "step": 1299 }, { "epoch": 2.073365231259968, "grad_norm": 0.19805970177101884, "learning_rate": 1.7169030732860522e-05, "loss": 0.2899, "step": 1300 }, { "epoch": 2.0749601275917064, "grad_norm": 0.1984139314909351, "learning_rate": 1.7139479905437353e-05, "loss": 0.2841, "step": 1301 }, { "epoch": 2.076555023923445, "grad_norm": 0.20389271034444423, "learning_rate": 1.7109929078014187e-05, "loss": 0.2744, "step": 1302 }, { "epoch": 2.0781499202551834, "grad_norm": 0.18605268287645366, "learning_rate": 1.7080378250591018e-05, "loss": 0.2591, "step": 1303 }, { "epoch": 2.079744816586922, "grad_norm": 0.19129928768466553, "learning_rate": 1.7050827423167852e-05, "loss": 0.2805, "step": 1304 }, { "epoch": 2.0813397129186604, "grad_norm": 0.2160399972482917, "learning_rate": 1.7021276595744682e-05, "loss": 0.2933, "step": 1305 }, { "epoch": 2.082934609250399, "grad_norm": 0.20505350005804443, "learning_rate": 1.6991725768321513e-05, "loss": 0.2905, "step": 1306 }, { "epoch": 2.0845295055821373, "grad_norm": 0.2106096401906927, "learning_rate": 1.6962174940898344e-05, "loss": 0.2812, "step": 1307 }, { "epoch": 2.0861244019138754, "grad_norm": 0.20023171468169892, "learning_rate": 1.6932624113475178e-05, "loss": 0.2876, "step": 1308 }, { "epoch": 2.087719298245614, "grad_norm": 0.21394105046680442, "learning_rate": 1.690307328605201e-05, "loss": 0.2797, "step": 1309 }, { "epoch": 2.0893141945773523, "grad_norm": 0.20904785118877023, "learning_rate": 1.6873522458628843e-05, "loss": 0.2893, "step": 1310 }, { "epoch": 2.090909090909091, "grad_norm": 0.22724801041098447, "learning_rate": 1.6843971631205673e-05, "loss": 0.2841, "step": 1311 }, { "epoch": 2.0925039872408293, "grad_norm": 0.20530285561491093, "learning_rate": 1.6814420803782507e-05, "loss": 0.2801, "step": 1312 }, { "epoch": 2.094098883572568, "grad_norm": 0.22542003932749957, "learning_rate": 1.6784869976359338e-05, "loss": 0.2824, "step": 1313 }, { "epoch": 2.0956937799043063, "grad_norm": 0.21075391916202688, "learning_rate": 1.6755319148936172e-05, "loss": 0.2922, "step": 1314 }, { "epoch": 2.0972886762360448, "grad_norm": 0.19545297941114959, "learning_rate": 1.6725768321513003e-05, "loss": 0.2781, "step": 1315 }, { "epoch": 2.0988835725677832, "grad_norm": 0.23359873365149852, "learning_rate": 1.6696217494089837e-05, "loss": 0.3035, "step": 1316 }, { "epoch": 2.1004784688995217, "grad_norm": 0.21276086411755243, "learning_rate": 1.6666666666666667e-05, "loss": 0.3037, "step": 1317 }, { "epoch": 2.1020733652312598, "grad_norm": 0.20857423989785753, "learning_rate": 1.66371158392435e-05, "loss": 0.2679, "step": 1318 }, { "epoch": 2.1036682615629982, "grad_norm": 0.19543214745574197, "learning_rate": 1.6607565011820332e-05, "loss": 0.2953, "step": 1319 }, { "epoch": 2.1052631578947367, "grad_norm": 0.19159016484409044, "learning_rate": 1.6578014184397163e-05, "loss": 0.2584, "step": 1320 }, { "epoch": 2.106858054226475, "grad_norm": 0.22327627856048726, "learning_rate": 1.6548463356973994e-05, "loss": 0.2956, "step": 1321 }, { "epoch": 2.1084529505582137, "grad_norm": 0.19444720550168898, "learning_rate": 1.6518912529550828e-05, "loss": 0.2807, "step": 1322 }, { "epoch": 2.110047846889952, "grad_norm": 0.1900549683375532, "learning_rate": 1.6489361702127658e-05, "loss": 0.2753, "step": 1323 }, { "epoch": 2.1116427432216907, "grad_norm": 0.20258808496025404, "learning_rate": 1.6459810874704492e-05, "loss": 0.2835, "step": 1324 }, { "epoch": 2.113237639553429, "grad_norm": 0.1951243752052535, "learning_rate": 1.6430260047281323e-05, "loss": 0.2801, "step": 1325 }, { "epoch": 2.1148325358851676, "grad_norm": 0.2123759112831173, "learning_rate": 1.6400709219858157e-05, "loss": 0.2955, "step": 1326 }, { "epoch": 2.116427432216906, "grad_norm": 0.19662084525654264, "learning_rate": 1.6371158392434988e-05, "loss": 0.2847, "step": 1327 }, { "epoch": 2.118022328548644, "grad_norm": 0.2118221941054082, "learning_rate": 1.6341607565011822e-05, "loss": 0.3025, "step": 1328 }, { "epoch": 2.1196172248803826, "grad_norm": 0.20189829864950928, "learning_rate": 1.6312056737588656e-05, "loss": 0.2801, "step": 1329 }, { "epoch": 2.121212121212121, "grad_norm": 0.19632480538182429, "learning_rate": 1.6282505910165487e-05, "loss": 0.2752, "step": 1330 }, { "epoch": 2.1228070175438596, "grad_norm": 0.19983522469152906, "learning_rate": 1.6252955082742317e-05, "loss": 0.2788, "step": 1331 }, { "epoch": 2.124401913875598, "grad_norm": 0.1928434703805117, "learning_rate": 1.6223404255319148e-05, "loss": 0.2694, "step": 1332 }, { "epoch": 2.1259968102073366, "grad_norm": 0.1943454991056505, "learning_rate": 1.6193853427895982e-05, "loss": 0.2813, "step": 1333 }, { "epoch": 2.127591706539075, "grad_norm": 0.19893504003928078, "learning_rate": 1.6164302600472813e-05, "loss": 0.3024, "step": 1334 }, { "epoch": 2.1291866028708135, "grad_norm": 0.1939095764626671, "learning_rate": 1.6134751773049647e-05, "loss": 0.2823, "step": 1335 }, { "epoch": 2.130781499202552, "grad_norm": 0.18552555225682718, "learning_rate": 1.6105200945626477e-05, "loss": 0.2708, "step": 1336 }, { "epoch": 2.1323763955342905, "grad_norm": 0.1954586935512692, "learning_rate": 1.607565011820331e-05, "loss": 0.2846, "step": 1337 }, { "epoch": 2.1339712918660285, "grad_norm": 0.19031945625778263, "learning_rate": 1.6046099290780142e-05, "loss": 0.303, "step": 1338 }, { "epoch": 2.135566188197767, "grad_norm": 0.19490022758794356, "learning_rate": 1.6016548463356976e-05, "loss": 0.2805, "step": 1339 }, { "epoch": 2.1371610845295055, "grad_norm": 0.19960134842331823, "learning_rate": 1.5986997635933807e-05, "loss": 0.286, "step": 1340 }, { "epoch": 2.138755980861244, "grad_norm": 0.1860721737972585, "learning_rate": 1.595744680851064e-05, "loss": 0.2634, "step": 1341 }, { "epoch": 2.1403508771929824, "grad_norm": 0.21629602097390996, "learning_rate": 1.592789598108747e-05, "loss": 0.2834, "step": 1342 }, { "epoch": 2.141945773524721, "grad_norm": 0.2040913997838003, "learning_rate": 1.5898345153664306e-05, "loss": 0.2905, "step": 1343 }, { "epoch": 2.1435406698564594, "grad_norm": 0.2013353899244977, "learning_rate": 1.5868794326241136e-05, "loss": 0.2792, "step": 1344 }, { "epoch": 2.145135566188198, "grad_norm": 0.2267916495005366, "learning_rate": 1.5839243498817967e-05, "loss": 0.2909, "step": 1345 }, { "epoch": 2.1467304625199364, "grad_norm": 0.20284835388492686, "learning_rate": 1.5809692671394798e-05, "loss": 0.2841, "step": 1346 }, { "epoch": 2.1483253588516744, "grad_norm": 0.21978471576143496, "learning_rate": 1.578014184397163e-05, "loss": 0.2797, "step": 1347 }, { "epoch": 2.149920255183413, "grad_norm": 0.22822530533482432, "learning_rate": 1.5750591016548462e-05, "loss": 0.2919, "step": 1348 }, { "epoch": 2.1515151515151514, "grad_norm": 0.2044945872441107, "learning_rate": 1.5721040189125296e-05, "loss": 0.2792, "step": 1349 }, { "epoch": 2.15311004784689, "grad_norm": 0.22935867377457256, "learning_rate": 1.5691489361702127e-05, "loss": 0.2925, "step": 1350 }, { "epoch": 2.1547049441786283, "grad_norm": 0.20932432909358972, "learning_rate": 1.566193853427896e-05, "loss": 0.2764, "step": 1351 }, { "epoch": 2.156299840510367, "grad_norm": 0.1905317558226721, "learning_rate": 1.5632387706855792e-05, "loss": 0.2742, "step": 1352 }, { "epoch": 2.1578947368421053, "grad_norm": 0.22400068662503747, "learning_rate": 1.5602836879432626e-05, "loss": 0.2719, "step": 1353 }, { "epoch": 2.159489633173844, "grad_norm": 0.19043507893823616, "learning_rate": 1.5573286052009457e-05, "loss": 0.2687, "step": 1354 }, { "epoch": 2.1610845295055823, "grad_norm": 0.21374620629127433, "learning_rate": 1.554373522458629e-05, "loss": 0.2993, "step": 1355 }, { "epoch": 2.1626794258373208, "grad_norm": 0.21272557256976962, "learning_rate": 1.551418439716312e-05, "loss": 0.282, "step": 1356 }, { "epoch": 2.1642743221690592, "grad_norm": 0.21547482708739815, "learning_rate": 1.5484633569739952e-05, "loss": 0.2994, "step": 1357 }, { "epoch": 2.1658692185007973, "grad_norm": 0.189697439257458, "learning_rate": 1.5455082742316786e-05, "loss": 0.2683, "step": 1358 }, { "epoch": 2.1674641148325358, "grad_norm": 0.21202136792691953, "learning_rate": 1.5425531914893617e-05, "loss": 0.2789, "step": 1359 }, { "epoch": 2.1690590111642742, "grad_norm": 0.21714491118212503, "learning_rate": 1.5395981087470447e-05, "loss": 0.2912, "step": 1360 }, { "epoch": 2.1706539074960127, "grad_norm": 0.20968427552640131, "learning_rate": 1.536643026004728e-05, "loss": 0.2866, "step": 1361 }, { "epoch": 2.172248803827751, "grad_norm": 0.20161618219190705, "learning_rate": 1.5336879432624115e-05, "loss": 0.2845, "step": 1362 }, { "epoch": 2.1738437001594897, "grad_norm": 0.2052042867666781, "learning_rate": 1.5307328605200946e-05, "loss": 0.2821, "step": 1363 }, { "epoch": 2.175438596491228, "grad_norm": 0.1999399159450641, "learning_rate": 1.527777777777778e-05, "loss": 0.2978, "step": 1364 }, { "epoch": 2.1770334928229667, "grad_norm": 0.20008653781476507, "learning_rate": 1.5248226950354611e-05, "loss": 0.2916, "step": 1365 }, { "epoch": 2.178628389154705, "grad_norm": 0.20553956057248662, "learning_rate": 1.5218676122931443e-05, "loss": 0.2766, "step": 1366 }, { "epoch": 2.180223285486443, "grad_norm": 0.1952951774504871, "learning_rate": 1.5189125295508274e-05, "loss": 0.2829, "step": 1367 }, { "epoch": 2.1818181818181817, "grad_norm": 0.1908656396942766, "learning_rate": 1.5159574468085108e-05, "loss": 0.2676, "step": 1368 }, { "epoch": 2.18341307814992, "grad_norm": 0.1881610539436902, "learning_rate": 1.5130023640661939e-05, "loss": 0.2786, "step": 1369 }, { "epoch": 2.1850079744816586, "grad_norm": 0.20180620139068708, "learning_rate": 1.5100472813238773e-05, "loss": 0.3015, "step": 1370 }, { "epoch": 2.186602870813397, "grad_norm": 0.1878395380947163, "learning_rate": 1.5070921985815603e-05, "loss": 0.2815, "step": 1371 }, { "epoch": 2.1881977671451356, "grad_norm": 0.19514778155353688, "learning_rate": 1.5041371158392437e-05, "loss": 0.2784, "step": 1372 }, { "epoch": 2.189792663476874, "grad_norm": 0.20228799672760642, "learning_rate": 1.5011820330969268e-05, "loss": 0.2863, "step": 1373 }, { "epoch": 2.1913875598086126, "grad_norm": 0.1922174723929839, "learning_rate": 1.49822695035461e-05, "loss": 0.2767, "step": 1374 }, { "epoch": 2.192982456140351, "grad_norm": 0.1935768570571481, "learning_rate": 1.4952718676122931e-05, "loss": 0.2765, "step": 1375 }, { "epoch": 2.1945773524720895, "grad_norm": 0.1926658173420166, "learning_rate": 1.4923167848699765e-05, "loss": 0.2881, "step": 1376 }, { "epoch": 2.1961722488038276, "grad_norm": 0.1981319332952795, "learning_rate": 1.4893617021276596e-05, "loss": 0.2835, "step": 1377 }, { "epoch": 2.197767145135566, "grad_norm": 0.19905238931993244, "learning_rate": 1.486406619385343e-05, "loss": 0.2784, "step": 1378 }, { "epoch": 2.1993620414673045, "grad_norm": 0.19557550695272094, "learning_rate": 1.483451536643026e-05, "loss": 0.2657, "step": 1379 }, { "epoch": 2.200956937799043, "grad_norm": 0.21865747454727352, "learning_rate": 1.4804964539007093e-05, "loss": 0.2891, "step": 1380 }, { "epoch": 2.2025518341307815, "grad_norm": 0.1932992066595581, "learning_rate": 1.4775413711583924e-05, "loss": 0.2796, "step": 1381 }, { "epoch": 2.20414673046252, "grad_norm": 0.19801759666548674, "learning_rate": 1.4745862884160758e-05, "loss": 0.2795, "step": 1382 }, { "epoch": 2.2057416267942584, "grad_norm": 0.1978862652711035, "learning_rate": 1.4716312056737588e-05, "loss": 0.2826, "step": 1383 }, { "epoch": 2.207336523125997, "grad_norm": 0.2189909743178743, "learning_rate": 1.4686761229314422e-05, "loss": 0.2951, "step": 1384 }, { "epoch": 2.2089314194577354, "grad_norm": 0.2186157874423699, "learning_rate": 1.4657210401891253e-05, "loss": 0.2888, "step": 1385 }, { "epoch": 2.2105263157894735, "grad_norm": 0.19849015803483708, "learning_rate": 1.4627659574468085e-05, "loss": 0.2781, "step": 1386 }, { "epoch": 2.212121212121212, "grad_norm": 0.20264742817387596, "learning_rate": 1.4598108747044916e-05, "loss": 0.299, "step": 1387 }, { "epoch": 2.2137161084529504, "grad_norm": 0.2039504580786349, "learning_rate": 1.456855791962175e-05, "loss": 0.2677, "step": 1388 }, { "epoch": 2.215311004784689, "grad_norm": 0.18738935626028597, "learning_rate": 1.4539007092198581e-05, "loss": 0.2867, "step": 1389 }, { "epoch": 2.2169059011164274, "grad_norm": 0.1949833166089482, "learning_rate": 1.4509456264775415e-05, "loss": 0.2808, "step": 1390 }, { "epoch": 2.218500797448166, "grad_norm": 0.20513122369351078, "learning_rate": 1.4479905437352246e-05, "loss": 0.2869, "step": 1391 }, { "epoch": 2.2200956937799043, "grad_norm": 0.19316845025143753, "learning_rate": 1.4450354609929078e-05, "loss": 0.2955, "step": 1392 }, { "epoch": 2.221690590111643, "grad_norm": 0.18352747577870804, "learning_rate": 1.4420803782505912e-05, "loss": 0.2806, "step": 1393 }, { "epoch": 2.2232854864433813, "grad_norm": 0.19223924201369896, "learning_rate": 1.4391252955082743e-05, "loss": 0.2955, "step": 1394 }, { "epoch": 2.22488038277512, "grad_norm": 0.19061308610341043, "learning_rate": 1.4361702127659577e-05, "loss": 0.2894, "step": 1395 }, { "epoch": 2.2264752791068583, "grad_norm": 0.18441278921912002, "learning_rate": 1.4332151300236407e-05, "loss": 0.2753, "step": 1396 }, { "epoch": 2.2280701754385963, "grad_norm": 0.18923614781159062, "learning_rate": 1.4302600472813242e-05, "loss": 0.2716, "step": 1397 }, { "epoch": 2.229665071770335, "grad_norm": 0.1982770659131026, "learning_rate": 1.4273049645390072e-05, "loss": 0.2902, "step": 1398 }, { "epoch": 2.2312599681020733, "grad_norm": 0.20584675781164427, "learning_rate": 1.4243498817966905e-05, "loss": 0.2762, "step": 1399 }, { "epoch": 2.2328548644338118, "grad_norm": 0.19218947949479012, "learning_rate": 1.4213947990543735e-05, "loss": 0.2855, "step": 1400 }, { "epoch": 2.2344497607655502, "grad_norm": 0.20209163240755496, "learning_rate": 1.418439716312057e-05, "loss": 0.2887, "step": 1401 }, { "epoch": 2.2360446570972887, "grad_norm": 0.20443051704199405, "learning_rate": 1.41548463356974e-05, "loss": 0.2904, "step": 1402 }, { "epoch": 2.237639553429027, "grad_norm": 0.19224705991841265, "learning_rate": 1.4125295508274234e-05, "loss": 0.2764, "step": 1403 }, { "epoch": 2.2392344497607657, "grad_norm": 0.1952273125371762, "learning_rate": 1.4095744680851065e-05, "loss": 0.2884, "step": 1404 }, { "epoch": 2.240829346092504, "grad_norm": 0.1835798363674109, "learning_rate": 1.4066193853427897e-05, "loss": 0.2804, "step": 1405 }, { "epoch": 2.242424242424242, "grad_norm": 0.2020954846133312, "learning_rate": 1.4036643026004728e-05, "loss": 0.306, "step": 1406 }, { "epoch": 2.2440191387559807, "grad_norm": 0.18743042324615364, "learning_rate": 1.4007092198581562e-05, "loss": 0.2849, "step": 1407 }, { "epoch": 2.245614035087719, "grad_norm": 0.19871136070006631, "learning_rate": 1.3977541371158392e-05, "loss": 0.2709, "step": 1408 }, { "epoch": 2.2472089314194577, "grad_norm": 0.1858298596775106, "learning_rate": 1.3947990543735227e-05, "loss": 0.2881, "step": 1409 }, { "epoch": 2.248803827751196, "grad_norm": 0.20714418441717844, "learning_rate": 1.3918439716312057e-05, "loss": 0.2827, "step": 1410 }, { "epoch": 2.2503987240829346, "grad_norm": 0.19107172286892524, "learning_rate": 1.388888888888889e-05, "loss": 0.2876, "step": 1411 }, { "epoch": 2.251993620414673, "grad_norm": 0.21288436196831753, "learning_rate": 1.385933806146572e-05, "loss": 0.289, "step": 1412 }, { "epoch": 2.2535885167464116, "grad_norm": 0.19338687768643992, "learning_rate": 1.3829787234042554e-05, "loss": 0.2804, "step": 1413 }, { "epoch": 2.25518341307815, "grad_norm": 0.19583569083608332, "learning_rate": 1.3800236406619385e-05, "loss": 0.274, "step": 1414 }, { "epoch": 2.2567783094098885, "grad_norm": 0.20983223932322062, "learning_rate": 1.3770685579196219e-05, "loss": 0.278, "step": 1415 }, { "epoch": 2.258373205741627, "grad_norm": 0.20423141746314272, "learning_rate": 1.374113475177305e-05, "loss": 0.3046, "step": 1416 }, { "epoch": 2.259968102073365, "grad_norm": 0.19744402630366822, "learning_rate": 1.3711583924349882e-05, "loss": 0.2791, "step": 1417 }, { "epoch": 2.2615629984051036, "grad_norm": 0.20291091342453943, "learning_rate": 1.3682033096926713e-05, "loss": 0.2804, "step": 1418 }, { "epoch": 2.263157894736842, "grad_norm": 0.20071450450922498, "learning_rate": 1.3652482269503547e-05, "loss": 0.2862, "step": 1419 }, { "epoch": 2.2647527910685805, "grad_norm": 0.20208467319281168, "learning_rate": 1.3622931442080377e-05, "loss": 0.2854, "step": 1420 }, { "epoch": 2.266347687400319, "grad_norm": 0.19995824517610936, "learning_rate": 1.3593380614657212e-05, "loss": 0.289, "step": 1421 }, { "epoch": 2.2679425837320575, "grad_norm": 0.2077220545177424, "learning_rate": 1.3563829787234042e-05, "loss": 0.2819, "step": 1422 }, { "epoch": 2.269537480063796, "grad_norm": 0.18662533529179257, "learning_rate": 1.3534278959810876e-05, "loss": 0.2668, "step": 1423 }, { "epoch": 2.2711323763955344, "grad_norm": 0.20192013366027087, "learning_rate": 1.3504728132387707e-05, "loss": 0.278, "step": 1424 }, { "epoch": 2.2727272727272725, "grad_norm": 0.2062007041017961, "learning_rate": 1.347517730496454e-05, "loss": 0.2985, "step": 1425 }, { "epoch": 2.274322169059011, "grad_norm": 0.19808076187440443, "learning_rate": 1.3445626477541373e-05, "loss": 0.297, "step": 1426 }, { "epoch": 2.2759170653907494, "grad_norm": 0.1960515801794439, "learning_rate": 1.3416075650118204e-05, "loss": 0.295, "step": 1427 }, { "epoch": 2.277511961722488, "grad_norm": 0.2031095578781459, "learning_rate": 1.3386524822695038e-05, "loss": 0.292, "step": 1428 }, { "epoch": 2.2791068580542264, "grad_norm": 0.188034008471154, "learning_rate": 1.3356973995271869e-05, "loss": 0.2771, "step": 1429 }, { "epoch": 2.280701754385965, "grad_norm": 0.19893426576660114, "learning_rate": 1.3327423167848701e-05, "loss": 0.3034, "step": 1430 }, { "epoch": 2.2822966507177034, "grad_norm": 0.19046453651407427, "learning_rate": 1.3297872340425532e-05, "loss": 0.2794, "step": 1431 }, { "epoch": 2.283891547049442, "grad_norm": 0.20390242354410631, "learning_rate": 1.3268321513002366e-05, "loss": 0.2954, "step": 1432 }, { "epoch": 2.2854864433811803, "grad_norm": 0.18884346423349735, "learning_rate": 1.3238770685579197e-05, "loss": 0.2858, "step": 1433 }, { "epoch": 2.287081339712919, "grad_norm": 0.19557846194668274, "learning_rate": 1.320921985815603e-05, "loss": 0.2823, "step": 1434 }, { "epoch": 2.2886762360446573, "grad_norm": 0.20247579610523028, "learning_rate": 1.3179669030732861e-05, "loss": 0.2846, "step": 1435 }, { "epoch": 2.2902711323763953, "grad_norm": 0.2016161922808375, "learning_rate": 1.3150118203309694e-05, "loss": 0.2784, "step": 1436 }, { "epoch": 2.291866028708134, "grad_norm": 0.19980998414335302, "learning_rate": 1.3120567375886524e-05, "loss": 0.2873, "step": 1437 }, { "epoch": 2.2934609250398723, "grad_norm": 0.19923441810298617, "learning_rate": 1.3091016548463358e-05, "loss": 0.2913, "step": 1438 }, { "epoch": 2.295055821371611, "grad_norm": 0.2071975867494998, "learning_rate": 1.3061465721040189e-05, "loss": 0.2891, "step": 1439 }, { "epoch": 2.2966507177033493, "grad_norm": 0.19819326066968188, "learning_rate": 1.3031914893617023e-05, "loss": 0.2794, "step": 1440 }, { "epoch": 2.2982456140350878, "grad_norm": 0.195451324303665, "learning_rate": 1.3002364066193854e-05, "loss": 0.2993, "step": 1441 }, { "epoch": 2.2998405103668262, "grad_norm": 0.20245173313875517, "learning_rate": 1.2972813238770686e-05, "loss": 0.2837, "step": 1442 }, { "epoch": 2.3014354066985647, "grad_norm": 0.1989454586337052, "learning_rate": 1.2943262411347517e-05, "loss": 0.2719, "step": 1443 }, { "epoch": 2.303030303030303, "grad_norm": 0.2065819441054297, "learning_rate": 1.2913711583924351e-05, "loss": 0.2905, "step": 1444 }, { "epoch": 2.3046251993620412, "grad_norm": 0.1831519846891132, "learning_rate": 1.2884160756501182e-05, "loss": 0.2703, "step": 1445 }, { "epoch": 2.3062200956937797, "grad_norm": 0.2100150745675072, "learning_rate": 1.2854609929078016e-05, "loss": 0.2911, "step": 1446 }, { "epoch": 2.307814992025518, "grad_norm": 0.19847979283761502, "learning_rate": 1.2825059101654846e-05, "loss": 0.2748, "step": 1447 }, { "epoch": 2.3094098883572567, "grad_norm": 0.1914695726421332, "learning_rate": 1.279550827423168e-05, "loss": 0.279, "step": 1448 }, { "epoch": 2.311004784688995, "grad_norm": 0.18002641177579845, "learning_rate": 1.2765957446808511e-05, "loss": 0.2769, "step": 1449 }, { "epoch": 2.3125996810207337, "grad_norm": 0.20262979949090446, "learning_rate": 1.2736406619385343e-05, "loss": 0.2848, "step": 1450 }, { "epoch": 2.314194577352472, "grad_norm": 0.21386372821479158, "learning_rate": 1.2706855791962174e-05, "loss": 0.2695, "step": 1451 }, { "epoch": 2.3157894736842106, "grad_norm": 0.18782057808952057, "learning_rate": 1.2677304964539008e-05, "loss": 0.2727, "step": 1452 }, { "epoch": 2.317384370015949, "grad_norm": 0.2022340430933124, "learning_rate": 1.2647754137115839e-05, "loss": 0.2882, "step": 1453 }, { "epoch": 2.3189792663476876, "grad_norm": 0.1931491212267657, "learning_rate": 1.2618203309692673e-05, "loss": 0.2687, "step": 1454 }, { "epoch": 2.320574162679426, "grad_norm": 0.2051691845307065, "learning_rate": 1.2588652482269504e-05, "loss": 0.2938, "step": 1455 }, { "epoch": 2.322169059011164, "grad_norm": 0.19455037422833552, "learning_rate": 1.2559101654846336e-05, "loss": 0.275, "step": 1456 }, { "epoch": 2.3237639553429026, "grad_norm": 0.19282845734668222, "learning_rate": 1.2529550827423167e-05, "loss": 0.2736, "step": 1457 }, { "epoch": 2.325358851674641, "grad_norm": 0.1959108730501667, "learning_rate": 1.25e-05, "loss": 0.2801, "step": 1458 }, { "epoch": 2.3269537480063796, "grad_norm": 0.19404161500257236, "learning_rate": 1.2470449172576833e-05, "loss": 0.2842, "step": 1459 }, { "epoch": 2.328548644338118, "grad_norm": 0.19734492077850768, "learning_rate": 1.2440898345153665e-05, "loss": 0.286, "step": 1460 }, { "epoch": 2.3301435406698565, "grad_norm": 0.18959279335721826, "learning_rate": 1.2411347517730498e-05, "loss": 0.283, "step": 1461 }, { "epoch": 2.331738437001595, "grad_norm": 0.2046592875434337, "learning_rate": 1.2381796690307328e-05, "loss": 0.2939, "step": 1462 }, { "epoch": 2.3333333333333335, "grad_norm": 0.17789541241823756, "learning_rate": 1.235224586288416e-05, "loss": 0.2789, "step": 1463 }, { "epoch": 2.334928229665072, "grad_norm": 0.19163505436331796, "learning_rate": 1.2322695035460993e-05, "loss": 0.3175, "step": 1464 }, { "epoch": 2.33652312599681, "grad_norm": 0.20491566988774973, "learning_rate": 1.2293144208037825e-05, "loss": 0.2846, "step": 1465 }, { "epoch": 2.3381180223285485, "grad_norm": 0.1965329371966845, "learning_rate": 1.2263593380614658e-05, "loss": 0.2758, "step": 1466 }, { "epoch": 2.339712918660287, "grad_norm": 0.19344861333345595, "learning_rate": 1.223404255319149e-05, "loss": 0.299, "step": 1467 }, { "epoch": 2.3413078149920254, "grad_norm": 0.18858201946113082, "learning_rate": 1.2204491725768321e-05, "loss": 0.2806, "step": 1468 }, { "epoch": 2.342902711323764, "grad_norm": 0.21429531891938958, "learning_rate": 1.2174940898345153e-05, "loss": 0.2912, "step": 1469 }, { "epoch": 2.3444976076555024, "grad_norm": 0.19690451547712964, "learning_rate": 1.2145390070921986e-05, "loss": 0.2828, "step": 1470 }, { "epoch": 2.346092503987241, "grad_norm": 0.19631069367050102, "learning_rate": 1.2115839243498818e-05, "loss": 0.2928, "step": 1471 }, { "epoch": 2.3476874003189794, "grad_norm": 0.19751839515619654, "learning_rate": 1.208628841607565e-05, "loss": 0.2808, "step": 1472 }, { "epoch": 2.349282296650718, "grad_norm": 0.1904312202786101, "learning_rate": 1.2056737588652483e-05, "loss": 0.2826, "step": 1473 }, { "epoch": 2.3508771929824563, "grad_norm": 0.20730845086770433, "learning_rate": 1.2027186761229315e-05, "loss": 0.2936, "step": 1474 }, { "epoch": 2.352472089314195, "grad_norm": 0.20987961761682486, "learning_rate": 1.1997635933806147e-05, "loss": 0.2962, "step": 1475 }, { "epoch": 2.354066985645933, "grad_norm": 0.19588466323202985, "learning_rate": 1.196808510638298e-05, "loss": 0.2755, "step": 1476 }, { "epoch": 2.3556618819776713, "grad_norm": 0.19241814016689676, "learning_rate": 1.1938534278959812e-05, "loss": 0.2778, "step": 1477 }, { "epoch": 2.35725677830941, "grad_norm": 0.19700782774248995, "learning_rate": 1.1908983451536645e-05, "loss": 0.2896, "step": 1478 }, { "epoch": 2.3588516746411483, "grad_norm": 0.19195611332452558, "learning_rate": 1.1879432624113477e-05, "loss": 0.2755, "step": 1479 }, { "epoch": 2.360446570972887, "grad_norm": 0.19111599681736302, "learning_rate": 1.1849881796690308e-05, "loss": 0.2806, "step": 1480 }, { "epoch": 2.3620414673046253, "grad_norm": 0.20788661589896487, "learning_rate": 1.182033096926714e-05, "loss": 0.2956, "step": 1481 }, { "epoch": 2.3636363636363638, "grad_norm": 0.18816303943032542, "learning_rate": 1.1790780141843972e-05, "loss": 0.2892, "step": 1482 }, { "epoch": 2.3652312599681022, "grad_norm": 0.1993155314625537, "learning_rate": 1.1761229314420805e-05, "loss": 0.2875, "step": 1483 }, { "epoch": 2.3668261562998403, "grad_norm": 0.2068906691977991, "learning_rate": 1.1731678486997637e-05, "loss": 0.2854, "step": 1484 }, { "epoch": 2.3684210526315788, "grad_norm": 0.20100892410346535, "learning_rate": 1.170212765957447e-05, "loss": 0.2729, "step": 1485 }, { "epoch": 2.3700159489633172, "grad_norm": 0.20177107586000337, "learning_rate": 1.1672576832151302e-05, "loss": 0.2892, "step": 1486 }, { "epoch": 2.3716108452950557, "grad_norm": 0.2044435164207284, "learning_rate": 1.1643026004728132e-05, "loss": 0.2933, "step": 1487 }, { "epoch": 2.373205741626794, "grad_norm": 0.20012703788779443, "learning_rate": 1.1613475177304965e-05, "loss": 0.2856, "step": 1488 }, { "epoch": 2.3748006379585327, "grad_norm": 0.2034987353486098, "learning_rate": 1.1583924349881797e-05, "loss": 0.2816, "step": 1489 }, { "epoch": 2.376395534290271, "grad_norm": 0.1937186929564102, "learning_rate": 1.155437352245863e-05, "loss": 0.2815, "step": 1490 }, { "epoch": 2.3779904306220097, "grad_norm": 0.20651216810946918, "learning_rate": 1.1524822695035462e-05, "loss": 0.296, "step": 1491 }, { "epoch": 2.379585326953748, "grad_norm": 0.19753393388723442, "learning_rate": 1.1495271867612294e-05, "loss": 0.2794, "step": 1492 }, { "epoch": 2.3811802232854866, "grad_norm": 0.20174514778275207, "learning_rate": 1.1465721040189125e-05, "loss": 0.2956, "step": 1493 }, { "epoch": 2.382775119617225, "grad_norm": 0.2001022121909817, "learning_rate": 1.1436170212765957e-05, "loss": 0.2844, "step": 1494 }, { "epoch": 2.384370015948963, "grad_norm": 0.19632779597410158, "learning_rate": 1.140661938534279e-05, "loss": 0.2883, "step": 1495 }, { "epoch": 2.3859649122807016, "grad_norm": 0.1811652520175169, "learning_rate": 1.1377068557919622e-05, "loss": 0.2874, "step": 1496 }, { "epoch": 2.38755980861244, "grad_norm": 0.19037096928544453, "learning_rate": 1.1347517730496454e-05, "loss": 0.2778, "step": 1497 }, { "epoch": 2.3891547049441786, "grad_norm": 0.1983304082315547, "learning_rate": 1.1317966903073287e-05, "loss": 0.2979, "step": 1498 }, { "epoch": 2.390749601275917, "grad_norm": 0.19089695935995762, "learning_rate": 1.1288416075650119e-05, "loss": 0.27, "step": 1499 }, { "epoch": 2.3923444976076556, "grad_norm": 0.19557845923968656, "learning_rate": 1.125886524822695e-05, "loss": 0.2803, "step": 1500 }, { "epoch": 2.393939393939394, "grad_norm": 0.18741268923229684, "learning_rate": 1.1229314420803782e-05, "loss": 0.274, "step": 1501 }, { "epoch": 2.3955342902711325, "grad_norm": 0.18621472224678465, "learning_rate": 1.1199763593380615e-05, "loss": 0.2944, "step": 1502 }, { "epoch": 2.397129186602871, "grad_norm": 0.20408399844040342, "learning_rate": 1.1170212765957447e-05, "loss": 0.2704, "step": 1503 }, { "epoch": 2.398724082934609, "grad_norm": 0.19239290679294213, "learning_rate": 1.114066193853428e-05, "loss": 0.2845, "step": 1504 }, { "epoch": 2.4003189792663475, "grad_norm": 0.18871300512883105, "learning_rate": 1.1111111111111112e-05, "loss": 0.283, "step": 1505 }, { "epoch": 2.401913875598086, "grad_norm": 0.19186691955878366, "learning_rate": 1.1081560283687942e-05, "loss": 0.2927, "step": 1506 }, { "epoch": 2.4035087719298245, "grad_norm": 0.18175033560754678, "learning_rate": 1.1052009456264776e-05, "loss": 0.2788, "step": 1507 }, { "epoch": 2.405103668261563, "grad_norm": 0.20212137417914952, "learning_rate": 1.1022458628841609e-05, "loss": 0.2854, "step": 1508 }, { "epoch": 2.4066985645933014, "grad_norm": 0.1944529221931983, "learning_rate": 1.0992907801418441e-05, "loss": 0.285, "step": 1509 }, { "epoch": 2.40829346092504, "grad_norm": 0.18654026327861178, "learning_rate": 1.0963356973995273e-05, "loss": 0.291, "step": 1510 }, { "epoch": 2.4098883572567784, "grad_norm": 0.18490106187594824, "learning_rate": 1.0933806146572106e-05, "loss": 0.2889, "step": 1511 }, { "epoch": 2.411483253588517, "grad_norm": 0.1956056437206659, "learning_rate": 1.0904255319148937e-05, "loss": 0.2942, "step": 1512 }, { "epoch": 2.4130781499202554, "grad_norm": 0.19288654810009526, "learning_rate": 1.0874704491725769e-05, "loss": 0.2739, "step": 1513 }, { "epoch": 2.414673046251994, "grad_norm": 0.19121616527714433, "learning_rate": 1.0845153664302601e-05, "loss": 0.2748, "step": 1514 }, { "epoch": 2.416267942583732, "grad_norm": 0.1915729310331162, "learning_rate": 1.0815602836879434e-05, "loss": 0.2877, "step": 1515 }, { "epoch": 2.4178628389154704, "grad_norm": 0.21373610057344555, "learning_rate": 1.0786052009456266e-05, "loss": 0.2706, "step": 1516 }, { "epoch": 2.419457735247209, "grad_norm": 0.20538930133963962, "learning_rate": 1.0756501182033098e-05, "loss": 0.2843, "step": 1517 }, { "epoch": 2.4210526315789473, "grad_norm": 0.19068833403484553, "learning_rate": 1.0726950354609929e-05, "loss": 0.285, "step": 1518 }, { "epoch": 2.422647527910686, "grad_norm": 0.19690251006466564, "learning_rate": 1.0697399527186761e-05, "loss": 0.2831, "step": 1519 }, { "epoch": 2.4242424242424243, "grad_norm": 0.18425867487877104, "learning_rate": 1.0667848699763594e-05, "loss": 0.2708, "step": 1520 }, { "epoch": 2.425837320574163, "grad_norm": 0.198924828040544, "learning_rate": 1.0638297872340426e-05, "loss": 0.2765, "step": 1521 }, { "epoch": 2.4274322169059013, "grad_norm": 0.1985763836727414, "learning_rate": 1.0608747044917258e-05, "loss": 0.2833, "step": 1522 }, { "epoch": 2.4290271132376393, "grad_norm": 0.19072257799717535, "learning_rate": 1.057919621749409e-05, "loss": 0.2965, "step": 1523 }, { "epoch": 2.430622009569378, "grad_norm": 0.190896009018634, "learning_rate": 1.0549645390070923e-05, "loss": 0.2925, "step": 1524 }, { "epoch": 2.4322169059011163, "grad_norm": 0.19306357985767586, "learning_rate": 1.0520094562647754e-05, "loss": 0.2817, "step": 1525 }, { "epoch": 2.4338118022328548, "grad_norm": 0.19802791625687227, "learning_rate": 1.0490543735224586e-05, "loss": 0.2811, "step": 1526 }, { "epoch": 2.4354066985645932, "grad_norm": 0.18673944821983854, "learning_rate": 1.0460992907801419e-05, "loss": 0.2717, "step": 1527 }, { "epoch": 2.4370015948963317, "grad_norm": 0.18982012865008374, "learning_rate": 1.0431442080378251e-05, "loss": 0.2908, "step": 1528 }, { "epoch": 2.43859649122807, "grad_norm": 0.18987392183919383, "learning_rate": 1.0401891252955083e-05, "loss": 0.2924, "step": 1529 }, { "epoch": 2.4401913875598087, "grad_norm": 0.20296136199875175, "learning_rate": 1.0372340425531916e-05, "loss": 0.2896, "step": 1530 }, { "epoch": 2.441786283891547, "grad_norm": 0.1883135365481678, "learning_rate": 1.0342789598108746e-05, "loss": 0.295, "step": 1531 }, { "epoch": 2.4433811802232857, "grad_norm": 0.20570713529105752, "learning_rate": 1.0313238770685579e-05, "loss": 0.2899, "step": 1532 }, { "epoch": 2.444976076555024, "grad_norm": 0.1840844736441399, "learning_rate": 1.0283687943262411e-05, "loss": 0.2873, "step": 1533 }, { "epoch": 2.446570972886762, "grad_norm": 0.18902507699303328, "learning_rate": 1.0254137115839243e-05, "loss": 0.3035, "step": 1534 }, { "epoch": 2.4481658692185007, "grad_norm": 0.22533236010690505, "learning_rate": 1.0224586288416076e-05, "loss": 0.2913, "step": 1535 }, { "epoch": 2.449760765550239, "grad_norm": 0.20065025841626785, "learning_rate": 1.0195035460992908e-05, "loss": 0.2812, "step": 1536 }, { "epoch": 2.4513556618819776, "grad_norm": 0.18641293107852303, "learning_rate": 1.016548463356974e-05, "loss": 0.2823, "step": 1537 }, { "epoch": 2.452950558213716, "grad_norm": 0.20220811438536426, "learning_rate": 1.0135933806146571e-05, "loss": 0.2902, "step": 1538 }, { "epoch": 2.4545454545454546, "grad_norm": 0.19705248878806375, "learning_rate": 1.0106382978723404e-05, "loss": 0.2841, "step": 1539 }, { "epoch": 2.456140350877193, "grad_norm": 0.22705606557097777, "learning_rate": 1.0076832151300238e-05, "loss": 0.2962, "step": 1540 }, { "epoch": 2.4577352472089316, "grad_norm": 0.19492893346554402, "learning_rate": 1.004728132387707e-05, "loss": 0.2944, "step": 1541 }, { "epoch": 2.45933014354067, "grad_norm": 0.18934586067304182, "learning_rate": 1.0017730496453902e-05, "loss": 0.2845, "step": 1542 }, { "epoch": 2.460925039872408, "grad_norm": 0.22260029318033508, "learning_rate": 9.988179669030733e-06, "loss": 0.2811, "step": 1543 }, { "epoch": 2.4625199362041466, "grad_norm": 0.1950902917296635, "learning_rate": 9.958628841607565e-06, "loss": 0.2855, "step": 1544 }, { "epoch": 2.464114832535885, "grad_norm": 0.2040774328919602, "learning_rate": 9.929078014184398e-06, "loss": 0.2775, "step": 1545 }, { "epoch": 2.4657097288676235, "grad_norm": 0.19547964526118985, "learning_rate": 9.89952718676123e-06, "loss": 0.2892, "step": 1546 }, { "epoch": 2.467304625199362, "grad_norm": 0.1993911430904888, "learning_rate": 9.869976359338063e-06, "loss": 0.2724, "step": 1547 }, { "epoch": 2.4688995215311005, "grad_norm": 0.20297384308172234, "learning_rate": 9.840425531914895e-06, "loss": 0.2735, "step": 1548 }, { "epoch": 2.470494417862839, "grad_norm": 0.2075388648580978, "learning_rate": 9.810874704491727e-06, "loss": 0.2846, "step": 1549 }, { "epoch": 2.4720893141945774, "grad_norm": 0.1865142751897726, "learning_rate": 9.781323877068558e-06, "loss": 0.2788, "step": 1550 }, { "epoch": 2.473684210526316, "grad_norm": 0.19814378044526623, "learning_rate": 9.75177304964539e-06, "loss": 0.2824, "step": 1551 }, { "epoch": 2.4752791068580544, "grad_norm": 0.19265947495554403, "learning_rate": 9.722222222222223e-06, "loss": 0.283, "step": 1552 }, { "epoch": 2.476874003189793, "grad_norm": 0.18311584758394678, "learning_rate": 9.692671394799055e-06, "loss": 0.2779, "step": 1553 }, { "epoch": 2.478468899521531, "grad_norm": 0.2059760169038871, "learning_rate": 9.663120567375887e-06, "loss": 0.2952, "step": 1554 }, { "epoch": 2.4800637958532694, "grad_norm": 0.19180257141218765, "learning_rate": 9.63356973995272e-06, "loss": 0.2856, "step": 1555 }, { "epoch": 2.481658692185008, "grad_norm": 0.18558096256204012, "learning_rate": 9.60401891252955e-06, "loss": 0.2923, "step": 1556 }, { "epoch": 2.4832535885167464, "grad_norm": 0.19573342773091482, "learning_rate": 9.574468085106383e-06, "loss": 0.299, "step": 1557 }, { "epoch": 2.484848484848485, "grad_norm": 0.18953627815124202, "learning_rate": 9.544917257683215e-06, "loss": 0.2727, "step": 1558 }, { "epoch": 2.4864433811802233, "grad_norm": 0.18352841708745943, "learning_rate": 9.515366430260048e-06, "loss": 0.286, "step": 1559 }, { "epoch": 2.488038277511962, "grad_norm": 0.19836615608377983, "learning_rate": 9.48581560283688e-06, "loss": 0.3015, "step": 1560 }, { "epoch": 2.4896331738437003, "grad_norm": 0.19815358318069923, "learning_rate": 9.456264775413712e-06, "loss": 0.2817, "step": 1561 }, { "epoch": 2.4912280701754383, "grad_norm": 0.19715172993722144, "learning_rate": 9.426713947990545e-06, "loss": 0.2908, "step": 1562 }, { "epoch": 2.492822966507177, "grad_norm": 0.2047881044112122, "learning_rate": 9.397163120567375e-06, "loss": 0.2839, "step": 1563 }, { "epoch": 2.4944178628389153, "grad_norm": 0.1742336806038457, "learning_rate": 9.367612293144208e-06, "loss": 0.2705, "step": 1564 }, { "epoch": 2.496012759170654, "grad_norm": 0.18607491634707524, "learning_rate": 9.33806146572104e-06, "loss": 0.2756, "step": 1565 }, { "epoch": 2.4976076555023923, "grad_norm": 0.18695068681300805, "learning_rate": 9.308510638297872e-06, "loss": 0.2823, "step": 1566 }, { "epoch": 2.4992025518341308, "grad_norm": 0.18789759324877334, "learning_rate": 9.278959810874705e-06, "loss": 0.2809, "step": 1567 }, { "epoch": 2.5007974481658692, "grad_norm": 0.20234363064068553, "learning_rate": 9.249408983451537e-06, "loss": 0.2965, "step": 1568 }, { "epoch": 2.5023923444976077, "grad_norm": 0.19754389520922236, "learning_rate": 9.219858156028368e-06, "loss": 0.2886, "step": 1569 }, { "epoch": 2.503987240829346, "grad_norm": 0.1956600172051185, "learning_rate": 9.1903073286052e-06, "loss": 0.284, "step": 1570 }, { "epoch": 2.5055821371610847, "grad_norm": 0.1795016433636832, "learning_rate": 9.160756501182033e-06, "loss": 0.2708, "step": 1571 }, { "epoch": 2.507177033492823, "grad_norm": 0.1807129324306049, "learning_rate": 9.131205673758867e-06, "loss": 0.2814, "step": 1572 }, { "epoch": 2.5087719298245617, "grad_norm": 0.1974063795203736, "learning_rate": 9.101654846335699e-06, "loss": 0.3016, "step": 1573 }, { "epoch": 2.5103668261562997, "grad_norm": 0.19001998668573047, "learning_rate": 9.072104018912531e-06, "loss": 0.2939, "step": 1574 }, { "epoch": 2.511961722488038, "grad_norm": 0.1850607437603953, "learning_rate": 9.042553191489362e-06, "loss": 0.2809, "step": 1575 }, { "epoch": 2.5135566188197767, "grad_norm": 0.17944250661444366, "learning_rate": 9.013002364066194e-06, "loss": 0.2853, "step": 1576 }, { "epoch": 2.515151515151515, "grad_norm": 0.17753066745865356, "learning_rate": 8.983451536643027e-06, "loss": 0.2815, "step": 1577 }, { "epoch": 2.5167464114832536, "grad_norm": 0.1828587414881581, "learning_rate": 8.953900709219859e-06, "loss": 0.2794, "step": 1578 }, { "epoch": 2.518341307814992, "grad_norm": 0.19250517882989604, "learning_rate": 8.924349881796691e-06, "loss": 0.286, "step": 1579 }, { "epoch": 2.5199362041467306, "grad_norm": 0.19090429362337388, "learning_rate": 8.894799054373524e-06, "loss": 0.277, "step": 1580 }, { "epoch": 2.5215311004784686, "grad_norm": 0.18931473325898293, "learning_rate": 8.865248226950355e-06, "loss": 0.2993, "step": 1581 }, { "epoch": 2.523125996810207, "grad_norm": 0.19282542391678295, "learning_rate": 8.835697399527187e-06, "loss": 0.2899, "step": 1582 }, { "epoch": 2.5247208931419456, "grad_norm": 0.18778000720903418, "learning_rate": 8.80614657210402e-06, "loss": 0.2781, "step": 1583 }, { "epoch": 2.526315789473684, "grad_norm": 0.1811835221234146, "learning_rate": 8.776595744680852e-06, "loss": 0.282, "step": 1584 }, { "epoch": 2.5279106858054226, "grad_norm": 0.18797641535740667, "learning_rate": 8.747044917257684e-06, "loss": 0.287, "step": 1585 }, { "epoch": 2.529505582137161, "grad_norm": 0.18347263295228575, "learning_rate": 8.717494089834516e-06, "loss": 0.2812, "step": 1586 }, { "epoch": 2.5311004784688995, "grad_norm": 0.18379235858940401, "learning_rate": 8.687943262411349e-06, "loss": 0.29, "step": 1587 }, { "epoch": 2.532695374800638, "grad_norm": 0.1931016277522652, "learning_rate": 8.65839243498818e-06, "loss": 0.2736, "step": 1588 }, { "epoch": 2.5342902711323765, "grad_norm": 0.18205996036016112, "learning_rate": 8.628841607565012e-06, "loss": 0.2692, "step": 1589 }, { "epoch": 2.535885167464115, "grad_norm": 0.17454427811581036, "learning_rate": 8.599290780141844e-06, "loss": 0.2691, "step": 1590 }, { "epoch": 2.5374800637958534, "grad_norm": 0.18252877821574, "learning_rate": 8.569739952718676e-06, "loss": 0.2809, "step": 1591 }, { "epoch": 2.539074960127592, "grad_norm": 0.19908376700760544, "learning_rate": 8.540189125295509e-06, "loss": 0.2904, "step": 1592 }, { "epoch": 2.5406698564593304, "grad_norm": 0.19716334308681502, "learning_rate": 8.510638297872341e-06, "loss": 0.2873, "step": 1593 }, { "epoch": 2.5422647527910684, "grad_norm": 0.19440645423906844, "learning_rate": 8.481087470449172e-06, "loss": 0.2836, "step": 1594 }, { "epoch": 2.543859649122807, "grad_norm": 0.18545581409523845, "learning_rate": 8.451536643026004e-06, "loss": 0.2854, "step": 1595 }, { "epoch": 2.5454545454545454, "grad_norm": 0.18918391693605466, "learning_rate": 8.421985815602837e-06, "loss": 0.293, "step": 1596 }, { "epoch": 2.547049441786284, "grad_norm": 0.18906132029478429, "learning_rate": 8.392434988179669e-06, "loss": 0.2668, "step": 1597 }, { "epoch": 2.5486443381180224, "grad_norm": 0.18385209471536998, "learning_rate": 8.362884160756501e-06, "loss": 0.2784, "step": 1598 }, { "epoch": 2.550239234449761, "grad_norm": 0.18349637680801525, "learning_rate": 8.333333333333334e-06, "loss": 0.2815, "step": 1599 }, { "epoch": 2.5518341307814993, "grad_norm": 0.1869380745357958, "learning_rate": 8.303782505910166e-06, "loss": 0.2812, "step": 1600 }, { "epoch": 2.5534290271132374, "grad_norm": 0.18569799481336957, "learning_rate": 8.274231678486997e-06, "loss": 0.2828, "step": 1601 }, { "epoch": 2.555023923444976, "grad_norm": 0.19778679157054505, "learning_rate": 8.244680851063829e-06, "loss": 0.2763, "step": 1602 }, { "epoch": 2.5566188197767143, "grad_norm": 0.20087950389755502, "learning_rate": 8.215130023640662e-06, "loss": 0.2955, "step": 1603 }, { "epoch": 2.558213716108453, "grad_norm": 0.18714852733716705, "learning_rate": 8.185579196217494e-06, "loss": 0.2936, "step": 1604 }, { "epoch": 2.5598086124401913, "grad_norm": 0.1912170418104873, "learning_rate": 8.156028368794328e-06, "loss": 0.2944, "step": 1605 }, { "epoch": 2.56140350877193, "grad_norm": 0.20302378936886878, "learning_rate": 8.126477541371159e-06, "loss": 0.282, "step": 1606 }, { "epoch": 2.5629984051036683, "grad_norm": 0.19222560733801272, "learning_rate": 8.096926713947991e-06, "loss": 0.2894, "step": 1607 }, { "epoch": 2.5645933014354068, "grad_norm": 0.18459187965468396, "learning_rate": 8.067375886524823e-06, "loss": 0.2924, "step": 1608 }, { "epoch": 2.5661881977671452, "grad_norm": 0.18600550546477626, "learning_rate": 8.037825059101656e-06, "loss": 0.2801, "step": 1609 }, { "epoch": 2.5677830940988837, "grad_norm": 0.1924764247901736, "learning_rate": 8.008274231678488e-06, "loss": 0.2839, "step": 1610 }, { "epoch": 2.569377990430622, "grad_norm": 0.1961943736418948, "learning_rate": 7.97872340425532e-06, "loss": 0.2791, "step": 1611 }, { "epoch": 2.5709728867623607, "grad_norm": 0.201448522527737, "learning_rate": 7.949172576832153e-06, "loss": 0.2848, "step": 1612 }, { "epoch": 2.5725677830940987, "grad_norm": 0.1836031665524547, "learning_rate": 7.919621749408983e-06, "loss": 0.2962, "step": 1613 }, { "epoch": 2.574162679425837, "grad_norm": 0.1874851023005661, "learning_rate": 7.890070921985816e-06, "loss": 0.3064, "step": 1614 }, { "epoch": 2.5757575757575757, "grad_norm": 0.19177212726895082, "learning_rate": 7.860520094562648e-06, "loss": 0.2914, "step": 1615 }, { "epoch": 2.577352472089314, "grad_norm": 0.1889514920721239, "learning_rate": 7.83096926713948e-06, "loss": 0.294, "step": 1616 }, { "epoch": 2.5789473684210527, "grad_norm": 0.20072453892850547, "learning_rate": 7.801418439716313e-06, "loss": 0.2897, "step": 1617 }, { "epoch": 2.580542264752791, "grad_norm": 0.19772628331442132, "learning_rate": 7.771867612293145e-06, "loss": 0.2951, "step": 1618 }, { "epoch": 2.5821371610845296, "grad_norm": 0.18317250374840166, "learning_rate": 7.742316784869976e-06, "loss": 0.2807, "step": 1619 }, { "epoch": 2.583732057416268, "grad_norm": 0.18663880533990573, "learning_rate": 7.712765957446808e-06, "loss": 0.2735, "step": 1620 }, { "epoch": 2.585326953748006, "grad_norm": 0.19740471846146276, "learning_rate": 7.68321513002364e-06, "loss": 0.3, "step": 1621 }, { "epoch": 2.5869218500797446, "grad_norm": 0.19277286203369193, "learning_rate": 7.653664302600473e-06, "loss": 0.2992, "step": 1622 }, { "epoch": 2.588516746411483, "grad_norm": 0.18393762505895114, "learning_rate": 7.6241134751773054e-06, "loss": 0.2822, "step": 1623 }, { "epoch": 2.5901116427432216, "grad_norm": 0.2031551704423779, "learning_rate": 7.594562647754137e-06, "loss": 0.2789, "step": 1624 }, { "epoch": 2.59170653907496, "grad_norm": 0.18596601261283405, "learning_rate": 7.565011820330969e-06, "loss": 0.279, "step": 1625 }, { "epoch": 2.5933014354066986, "grad_norm": 0.17734962377638386, "learning_rate": 7.535460992907802e-06, "loss": 0.2787, "step": 1626 }, { "epoch": 2.594896331738437, "grad_norm": 0.1756105884820681, "learning_rate": 7.505910165484634e-06, "loss": 0.2643, "step": 1627 }, { "epoch": 2.5964912280701755, "grad_norm": 0.18937925308847342, "learning_rate": 7.4763593380614656e-06, "loss": 0.2817, "step": 1628 }, { "epoch": 2.598086124401914, "grad_norm": 0.22587981429490991, "learning_rate": 7.446808510638298e-06, "loss": 0.2701, "step": 1629 }, { "epoch": 2.5996810207336525, "grad_norm": 0.19166428565311763, "learning_rate": 7.41725768321513e-06, "loss": 0.2972, "step": 1630 }, { "epoch": 2.601275917065391, "grad_norm": 0.20012005867494403, "learning_rate": 7.387706855791962e-06, "loss": 0.2843, "step": 1631 }, { "epoch": 2.6028708133971294, "grad_norm": 0.18715412005670562, "learning_rate": 7.358156028368794e-06, "loss": 0.2866, "step": 1632 }, { "epoch": 2.6044657097288675, "grad_norm": 0.18776760749027493, "learning_rate": 7.3286052009456266e-06, "loss": 0.3004, "step": 1633 }, { "epoch": 2.606060606060606, "grad_norm": 0.18123317223486105, "learning_rate": 7.299054373522458e-06, "loss": 0.2893, "step": 1634 }, { "epoch": 2.6076555023923444, "grad_norm": 0.191055901639879, "learning_rate": 7.2695035460992904e-06, "loss": 0.3007, "step": 1635 }, { "epoch": 2.609250398724083, "grad_norm": 0.17968835088768598, "learning_rate": 7.239952718676123e-06, "loss": 0.2766, "step": 1636 }, { "epoch": 2.6108452950558214, "grad_norm": 0.18512120300038518, "learning_rate": 7.210401891252956e-06, "loss": 0.2706, "step": 1637 }, { "epoch": 2.61244019138756, "grad_norm": 0.18531488022148296, "learning_rate": 7.180851063829788e-06, "loss": 0.2701, "step": 1638 }, { "epoch": 2.6140350877192984, "grad_norm": 0.185397866546925, "learning_rate": 7.151300236406621e-06, "loss": 0.2898, "step": 1639 }, { "epoch": 2.6156299840510364, "grad_norm": 0.19027842188353375, "learning_rate": 7.121749408983452e-06, "loss": 0.2876, "step": 1640 }, { "epoch": 2.617224880382775, "grad_norm": 0.18798837658713757, "learning_rate": 7.092198581560285e-06, "loss": 0.2761, "step": 1641 }, { "epoch": 2.6188197767145134, "grad_norm": 0.18275623306092637, "learning_rate": 7.062647754137117e-06, "loss": 0.2828, "step": 1642 }, { "epoch": 2.620414673046252, "grad_norm": 0.18298566542798117, "learning_rate": 7.0330969267139485e-06, "loss": 0.2695, "step": 1643 }, { "epoch": 2.6220095693779903, "grad_norm": 0.18963053765377597, "learning_rate": 7.003546099290781e-06, "loss": 0.3067, "step": 1644 }, { "epoch": 2.623604465709729, "grad_norm": 0.18950279165727615, "learning_rate": 6.973995271867613e-06, "loss": 0.2943, "step": 1645 }, { "epoch": 2.6251993620414673, "grad_norm": 0.1895136916473773, "learning_rate": 6.944444444444445e-06, "loss": 0.2866, "step": 1646 }, { "epoch": 2.626794258373206, "grad_norm": 0.19469593607412405, "learning_rate": 6.914893617021277e-06, "loss": 0.2832, "step": 1647 }, { "epoch": 2.6283891547049443, "grad_norm": 0.17376999325162917, "learning_rate": 6.8853427895981095e-06, "loss": 0.2657, "step": 1648 }, { "epoch": 2.6299840510366828, "grad_norm": 0.1885387750740932, "learning_rate": 6.855791962174941e-06, "loss": 0.2902, "step": 1649 }, { "epoch": 2.6315789473684212, "grad_norm": 0.18690929164167, "learning_rate": 6.826241134751773e-06, "loss": 0.3096, "step": 1650 }, { "epoch": 2.6331738437001597, "grad_norm": 0.18578658664270561, "learning_rate": 6.796690307328606e-06, "loss": 0.2733, "step": 1651 }, { "epoch": 2.6347687400318978, "grad_norm": 0.18811487324755335, "learning_rate": 6.767139479905438e-06, "loss": 0.2752, "step": 1652 }, { "epoch": 2.6363636363636362, "grad_norm": 0.18718502360200667, "learning_rate": 6.73758865248227e-06, "loss": 0.2665, "step": 1653 }, { "epoch": 2.6379585326953747, "grad_norm": 0.18741163426013568, "learning_rate": 6.708037825059102e-06, "loss": 0.3098, "step": 1654 }, { "epoch": 2.639553429027113, "grad_norm": 0.18815181674979198, "learning_rate": 6.678486997635934e-06, "loss": 0.3028, "step": 1655 }, { "epoch": 2.6411483253588517, "grad_norm": 0.18809545156650403, "learning_rate": 6.648936170212766e-06, "loss": 0.2877, "step": 1656 }, { "epoch": 2.64274322169059, "grad_norm": 0.19977833281654508, "learning_rate": 6.619385342789598e-06, "loss": 0.2866, "step": 1657 }, { "epoch": 2.6443381180223287, "grad_norm": 0.20922428886952288, "learning_rate": 6.589834515366431e-06, "loss": 0.2808, "step": 1658 }, { "epoch": 2.645933014354067, "grad_norm": 0.1804228202691039, "learning_rate": 6.560283687943262e-06, "loss": 0.2724, "step": 1659 }, { "epoch": 2.647527910685805, "grad_norm": 0.1822686809029519, "learning_rate": 6.5307328605200945e-06, "loss": 0.2705, "step": 1660 }, { "epoch": 2.6491228070175437, "grad_norm": 0.18301128966439506, "learning_rate": 6.501182033096927e-06, "loss": 0.2847, "step": 1661 }, { "epoch": 2.650717703349282, "grad_norm": 0.20211409613988554, "learning_rate": 6.471631205673758e-06, "loss": 0.2795, "step": 1662 }, { "epoch": 2.6523125996810206, "grad_norm": 0.18845378747367333, "learning_rate": 6.442080378250591e-06, "loss": 0.2714, "step": 1663 }, { "epoch": 2.653907496012759, "grad_norm": 0.19888813200069672, "learning_rate": 6.412529550827423e-06, "loss": 0.2927, "step": 1664 }, { "epoch": 2.6555023923444976, "grad_norm": 0.1953257367254546, "learning_rate": 6.3829787234042555e-06, "loss": 0.2838, "step": 1665 }, { "epoch": 2.657097288676236, "grad_norm": 0.18771360379521002, "learning_rate": 6.353427895981087e-06, "loss": 0.2952, "step": 1666 }, { "epoch": 2.6586921850079746, "grad_norm": 0.1976741404415028, "learning_rate": 6.323877068557919e-06, "loss": 0.2898, "step": 1667 }, { "epoch": 2.660287081339713, "grad_norm": 0.21383410061084837, "learning_rate": 6.294326241134752e-06, "loss": 0.2945, "step": 1668 }, { "epoch": 2.6618819776714515, "grad_norm": 0.19484298776539988, "learning_rate": 6.264775413711583e-06, "loss": 0.2946, "step": 1669 }, { "epoch": 2.66347687400319, "grad_norm": 0.18811991361202637, "learning_rate": 6.2352245862884165e-06, "loss": 0.2816, "step": 1670 }, { "epoch": 2.6650717703349285, "grad_norm": 0.1981313297135003, "learning_rate": 6.205673758865249e-06, "loss": 0.279, "step": 1671 }, { "epoch": 2.6666666666666665, "grad_norm": 0.17909777919997777, "learning_rate": 6.17612293144208e-06, "loss": 0.2755, "step": 1672 }, { "epoch": 2.668261562998405, "grad_norm": 0.18823151570626886, "learning_rate": 6.146572104018913e-06, "loss": 0.2857, "step": 1673 }, { "epoch": 2.6698564593301435, "grad_norm": 0.182000099586149, "learning_rate": 6.117021276595745e-06, "loss": 0.2717, "step": 1674 }, { "epoch": 2.671451355661882, "grad_norm": 0.17823054278949657, "learning_rate": 6.087470449172577e-06, "loss": 0.2791, "step": 1675 }, { "epoch": 2.6730462519936204, "grad_norm": 0.18419639947949684, "learning_rate": 6.057919621749409e-06, "loss": 0.2886, "step": 1676 }, { "epoch": 2.674641148325359, "grad_norm": 0.18636339243889496, "learning_rate": 6.028368794326241e-06, "loss": 0.2846, "step": 1677 }, { "epoch": 2.6762360446570974, "grad_norm": 0.188314413135542, "learning_rate": 5.998817966903074e-06, "loss": 0.2992, "step": 1678 }, { "epoch": 2.6778309409888355, "grad_norm": 0.1948841146215719, "learning_rate": 5.969267139479906e-06, "loss": 0.3002, "step": 1679 }, { "epoch": 2.679425837320574, "grad_norm": 0.18976731019545856, "learning_rate": 5.9397163120567384e-06, "loss": 0.2791, "step": 1680 }, { "epoch": 2.6810207336523124, "grad_norm": 0.1938385754755735, "learning_rate": 5.91016548463357e-06, "loss": 0.2953, "step": 1681 }, { "epoch": 2.682615629984051, "grad_norm": 0.17797478793443586, "learning_rate": 5.880614657210402e-06, "loss": 0.2851, "step": 1682 }, { "epoch": 2.6842105263157894, "grad_norm": 0.19117519097745528, "learning_rate": 5.851063829787235e-06, "loss": 0.2849, "step": 1683 }, { "epoch": 2.685805422647528, "grad_norm": 0.18234347839255963, "learning_rate": 5.821513002364066e-06, "loss": 0.2847, "step": 1684 }, { "epoch": 2.6874003189792663, "grad_norm": 0.1771123877937242, "learning_rate": 5.791962174940899e-06, "loss": 0.2804, "step": 1685 }, { "epoch": 2.688995215311005, "grad_norm": 0.18435336659547338, "learning_rate": 5.762411347517731e-06, "loss": 0.2605, "step": 1686 }, { "epoch": 2.6905901116427433, "grad_norm": 0.18136522555121193, "learning_rate": 5.7328605200945625e-06, "loss": 0.2799, "step": 1687 }, { "epoch": 2.692185007974482, "grad_norm": 0.1892267637389365, "learning_rate": 5.703309692671395e-06, "loss": 0.2873, "step": 1688 }, { "epoch": 2.6937799043062203, "grad_norm": 0.18194730686013494, "learning_rate": 5.673758865248227e-06, "loss": 0.2742, "step": 1689 }, { "epoch": 2.6953748006379588, "grad_norm": 0.17933584042409917, "learning_rate": 5.6442080378250596e-06, "loss": 0.287, "step": 1690 }, { "epoch": 2.6969696969696972, "grad_norm": 0.18127732309303005, "learning_rate": 5.614657210401891e-06, "loss": 0.2845, "step": 1691 }, { "epoch": 2.6985645933014353, "grad_norm": 0.17593909079641093, "learning_rate": 5.5851063829787235e-06, "loss": 0.2866, "step": 1692 }, { "epoch": 2.7001594896331738, "grad_norm": 0.1763272518415149, "learning_rate": 5.555555555555556e-06, "loss": 0.2819, "step": 1693 }, { "epoch": 2.7017543859649122, "grad_norm": 0.17959591044978926, "learning_rate": 5.526004728132388e-06, "loss": 0.2833, "step": 1694 }, { "epoch": 2.7033492822966507, "grad_norm": 0.18598328267085493, "learning_rate": 5.4964539007092206e-06, "loss": 0.2998, "step": 1695 }, { "epoch": 2.704944178628389, "grad_norm": 0.18620435854570758, "learning_rate": 5.466903073286053e-06, "loss": 0.2763, "step": 1696 }, { "epoch": 2.7065390749601277, "grad_norm": 0.18272021261547022, "learning_rate": 5.4373522458628844e-06, "loss": 0.2857, "step": 1697 }, { "epoch": 2.708133971291866, "grad_norm": 0.17826006490416488, "learning_rate": 5.407801418439717e-06, "loss": 0.2817, "step": 1698 }, { "epoch": 2.709728867623604, "grad_norm": 0.1841703662061206, "learning_rate": 5.378250591016549e-06, "loss": 0.2999, "step": 1699 }, { "epoch": 2.7113237639553427, "grad_norm": 0.1980794778450659, "learning_rate": 5.348699763593381e-06, "loss": 0.3069, "step": 1700 }, { "epoch": 2.712918660287081, "grad_norm": 0.18988986101598312, "learning_rate": 5.319148936170213e-06, "loss": 0.2729, "step": 1701 }, { "epoch": 2.7145135566188197, "grad_norm": 0.1746929269301279, "learning_rate": 5.289598108747045e-06, "loss": 0.2911, "step": 1702 }, { "epoch": 2.716108452950558, "grad_norm": 0.17947929244683142, "learning_rate": 5.260047281323877e-06, "loss": 0.2814, "step": 1703 }, { "epoch": 2.7177033492822966, "grad_norm": 0.17312963409200433, "learning_rate": 5.230496453900709e-06, "loss": 0.266, "step": 1704 }, { "epoch": 2.719298245614035, "grad_norm": 0.1723025204751791, "learning_rate": 5.200945626477542e-06, "loss": 0.2825, "step": 1705 }, { "epoch": 2.7208931419457736, "grad_norm": 0.18496166549362708, "learning_rate": 5.171394799054373e-06, "loss": 0.2828, "step": 1706 }, { "epoch": 2.722488038277512, "grad_norm": 0.18466021102045926, "learning_rate": 5.1418439716312056e-06, "loss": 0.2881, "step": 1707 }, { "epoch": 2.7240829346092506, "grad_norm": 0.18630007842938237, "learning_rate": 5.112293144208038e-06, "loss": 0.2845, "step": 1708 }, { "epoch": 2.725677830940989, "grad_norm": 0.1770324804521593, "learning_rate": 5.08274231678487e-06, "loss": 0.2745, "step": 1709 }, { "epoch": 2.7272727272727275, "grad_norm": 0.18671370525404168, "learning_rate": 5.053191489361702e-06, "loss": 0.2932, "step": 1710 }, { "epoch": 2.7288676236044656, "grad_norm": 0.17820747592982733, "learning_rate": 5.023640661938535e-06, "loss": 0.2773, "step": 1711 }, { "epoch": 2.730462519936204, "grad_norm": 0.18101847714533223, "learning_rate": 4.9940898345153665e-06, "loss": 0.272, "step": 1712 }, { "epoch": 2.7320574162679425, "grad_norm": 0.17930051867895408, "learning_rate": 4.964539007092199e-06, "loss": 0.2894, "step": 1713 }, { "epoch": 2.733652312599681, "grad_norm": 0.18458069539858743, "learning_rate": 4.934988179669031e-06, "loss": 0.272, "step": 1714 }, { "epoch": 2.7352472089314195, "grad_norm": 0.17978175879985378, "learning_rate": 4.905437352245864e-06, "loss": 0.2917, "step": 1715 }, { "epoch": 2.736842105263158, "grad_norm": 0.17787974128160833, "learning_rate": 4.875886524822695e-06, "loss": 0.2804, "step": 1716 }, { "epoch": 2.7384370015948964, "grad_norm": 0.1758375514258994, "learning_rate": 4.8463356973995275e-06, "loss": 0.2838, "step": 1717 }, { "epoch": 2.740031897926635, "grad_norm": 0.1797066317303636, "learning_rate": 4.81678486997636e-06, "loss": 0.2825, "step": 1718 }, { "epoch": 2.741626794258373, "grad_norm": 0.18335125747129097, "learning_rate": 4.787234042553191e-06, "loss": 0.2848, "step": 1719 }, { "epoch": 2.7432216905901115, "grad_norm": 0.18677877959884115, "learning_rate": 4.757683215130024e-06, "loss": 0.2825, "step": 1720 }, { "epoch": 2.74481658692185, "grad_norm": 0.19242830393478508, "learning_rate": 4.728132387706856e-06, "loss": 0.2791, "step": 1721 }, { "epoch": 2.7464114832535884, "grad_norm": 0.18810671670551757, "learning_rate": 4.698581560283688e-06, "loss": 0.2889, "step": 1722 }, { "epoch": 2.748006379585327, "grad_norm": 0.17731943094652056, "learning_rate": 4.66903073286052e-06, "loss": 0.2885, "step": 1723 }, { "epoch": 2.7496012759170654, "grad_norm": 0.18597824171078742, "learning_rate": 4.639479905437352e-06, "loss": 0.2926, "step": 1724 }, { "epoch": 2.751196172248804, "grad_norm": 0.1830561077652426, "learning_rate": 4.609929078014184e-06, "loss": 0.2978, "step": 1725 }, { "epoch": 2.7527910685805423, "grad_norm": 0.18532998507708187, "learning_rate": 4.580378250591016e-06, "loss": 0.2797, "step": 1726 }, { "epoch": 2.754385964912281, "grad_norm": 0.17569667442848538, "learning_rate": 4.5508274231678495e-06, "loss": 0.2884, "step": 1727 }, { "epoch": 2.7559808612440193, "grad_norm": 0.18080314142919746, "learning_rate": 4.521276595744681e-06, "loss": 0.2773, "step": 1728 }, { "epoch": 2.757575757575758, "grad_norm": 0.17751089977928947, "learning_rate": 4.491725768321513e-06, "loss": 0.2859, "step": 1729 }, { "epoch": 2.7591706539074963, "grad_norm": 0.17956015919680968, "learning_rate": 4.462174940898346e-06, "loss": 0.2798, "step": 1730 }, { "epoch": 2.7607655502392343, "grad_norm": 0.18456714746777728, "learning_rate": 4.432624113475177e-06, "loss": 0.2875, "step": 1731 }, { "epoch": 2.762360446570973, "grad_norm": 0.18760603363340728, "learning_rate": 4.40307328605201e-06, "loss": 0.3003, "step": 1732 }, { "epoch": 2.7639553429027113, "grad_norm": 0.18611271462520643, "learning_rate": 4.373522458628842e-06, "loss": 0.2757, "step": 1733 }, { "epoch": 2.7655502392344498, "grad_norm": 0.18072017370301766, "learning_rate": 4.343971631205674e-06, "loss": 0.2953, "step": 1734 }, { "epoch": 2.7671451355661882, "grad_norm": 0.18232024733698296, "learning_rate": 4.314420803782506e-06, "loss": 0.2804, "step": 1735 }, { "epoch": 2.7687400318979267, "grad_norm": 0.17744879681885975, "learning_rate": 4.284869976359338e-06, "loss": 0.2907, "step": 1736 }, { "epoch": 2.770334928229665, "grad_norm": 0.17278197243171844, "learning_rate": 4.255319148936171e-06, "loss": 0.2719, "step": 1737 }, { "epoch": 2.7719298245614032, "grad_norm": 0.17784876695115773, "learning_rate": 4.225768321513002e-06, "loss": 0.2886, "step": 1738 }, { "epoch": 2.7735247208931417, "grad_norm": 0.1813423935078373, "learning_rate": 4.1962174940898345e-06, "loss": 0.2818, "step": 1739 }, { "epoch": 2.77511961722488, "grad_norm": 0.18225744263945784, "learning_rate": 4.166666666666667e-06, "loss": 0.2978, "step": 1740 }, { "epoch": 2.7767145135566187, "grad_norm": 0.17735621196416188, "learning_rate": 4.137115839243498e-06, "loss": 0.2786, "step": 1741 }, { "epoch": 2.778309409888357, "grad_norm": 0.1789763979009108, "learning_rate": 4.107565011820331e-06, "loss": 0.2857, "step": 1742 }, { "epoch": 2.7799043062200957, "grad_norm": 0.19895975336883043, "learning_rate": 4.078014184397164e-06, "loss": 0.2914, "step": 1743 }, { "epoch": 2.781499202551834, "grad_norm": 0.17925399036730771, "learning_rate": 4.0484633569739955e-06, "loss": 0.277, "step": 1744 }, { "epoch": 2.7830940988835726, "grad_norm": 0.18317064711929226, "learning_rate": 4.018912529550828e-06, "loss": 0.2737, "step": 1745 }, { "epoch": 2.784688995215311, "grad_norm": 0.1778614822124566, "learning_rate": 3.98936170212766e-06, "loss": 0.2738, "step": 1746 }, { "epoch": 2.7862838915470496, "grad_norm": 0.17722353188215656, "learning_rate": 3.959810874704492e-06, "loss": 0.2798, "step": 1747 }, { "epoch": 2.787878787878788, "grad_norm": 0.18353778393726702, "learning_rate": 3.930260047281324e-06, "loss": 0.2842, "step": 1748 }, { "epoch": 2.7894736842105265, "grad_norm": 0.19628350261121563, "learning_rate": 3.9007092198581565e-06, "loss": 0.2849, "step": 1749 }, { "epoch": 2.7910685805422646, "grad_norm": 0.18032020702632592, "learning_rate": 3.871158392434988e-06, "loss": 0.2797, "step": 1750 }, { "epoch": 2.792663476874003, "grad_norm": 0.17771605415564826, "learning_rate": 3.84160756501182e-06, "loss": 0.2869, "step": 1751 }, { "epoch": 2.7942583732057416, "grad_norm": 0.17007783356152567, "learning_rate": 3.8120567375886527e-06, "loss": 0.2736, "step": 1752 }, { "epoch": 2.79585326953748, "grad_norm": 0.17712714824233367, "learning_rate": 3.7825059101654847e-06, "loss": 0.2829, "step": 1753 }, { "epoch": 2.7974481658692185, "grad_norm": 0.18069192004897303, "learning_rate": 3.752955082742317e-06, "loss": 0.2837, "step": 1754 }, { "epoch": 2.799043062200957, "grad_norm": 0.178553265681577, "learning_rate": 3.723404255319149e-06, "loss": 0.266, "step": 1755 }, { "epoch": 2.8006379585326955, "grad_norm": 0.17851569725907315, "learning_rate": 3.693853427895981e-06, "loss": 0.2662, "step": 1756 }, { "epoch": 2.802232854864434, "grad_norm": 0.1754202251015534, "learning_rate": 3.6643026004728133e-06, "loss": 0.278, "step": 1757 }, { "epoch": 2.803827751196172, "grad_norm": 0.1933666615044741, "learning_rate": 3.6347517730496452e-06, "loss": 0.2987, "step": 1758 }, { "epoch": 2.8054226475279105, "grad_norm": 0.18579234672077763, "learning_rate": 3.605200945626478e-06, "loss": 0.2885, "step": 1759 }, { "epoch": 2.807017543859649, "grad_norm": 0.17447977375321475, "learning_rate": 3.5756501182033104e-06, "loss": 0.29, "step": 1760 }, { "epoch": 2.8086124401913874, "grad_norm": 0.18352080223087583, "learning_rate": 3.5460992907801423e-06, "loss": 0.2781, "step": 1761 }, { "epoch": 2.810207336523126, "grad_norm": 0.18355908638644333, "learning_rate": 3.5165484633569743e-06, "loss": 0.284, "step": 1762 }, { "epoch": 2.8118022328548644, "grad_norm": 0.1890069487654065, "learning_rate": 3.4869976359338066e-06, "loss": 0.2913, "step": 1763 }, { "epoch": 2.813397129186603, "grad_norm": 0.18896286836594003, "learning_rate": 3.4574468085106386e-06, "loss": 0.2913, "step": 1764 }, { "epoch": 2.8149920255183414, "grad_norm": 0.17967768099605255, "learning_rate": 3.4278959810874705e-06, "loss": 0.2863, "step": 1765 }, { "epoch": 2.81658692185008, "grad_norm": 0.17387467193091, "learning_rate": 3.398345153664303e-06, "loss": 0.2706, "step": 1766 }, { "epoch": 2.8181818181818183, "grad_norm": 0.18084464274653345, "learning_rate": 3.368794326241135e-06, "loss": 0.2944, "step": 1767 }, { "epoch": 2.819776714513557, "grad_norm": 0.17648290696508814, "learning_rate": 3.339243498817967e-06, "loss": 0.2834, "step": 1768 }, { "epoch": 2.8213716108452953, "grad_norm": 0.19250172087957507, "learning_rate": 3.309692671394799e-06, "loss": 0.2882, "step": 1769 }, { "epoch": 2.8229665071770333, "grad_norm": 0.18971035434351524, "learning_rate": 3.280141843971631e-06, "loss": 0.2812, "step": 1770 }, { "epoch": 2.824561403508772, "grad_norm": 0.1801286920547348, "learning_rate": 3.2505910165484634e-06, "loss": 0.288, "step": 1771 }, { "epoch": 2.8261562998405103, "grad_norm": 0.17585151133560187, "learning_rate": 3.2210401891252954e-06, "loss": 0.2747, "step": 1772 }, { "epoch": 2.827751196172249, "grad_norm": 0.19014168343837856, "learning_rate": 3.1914893617021277e-06, "loss": 0.3034, "step": 1773 }, { "epoch": 2.8293460925039873, "grad_norm": 0.17011471651029966, "learning_rate": 3.1619385342789597e-06, "loss": 0.2746, "step": 1774 }, { "epoch": 2.8309409888357258, "grad_norm": 0.17994350083269706, "learning_rate": 3.1323877068557916e-06, "loss": 0.2822, "step": 1775 }, { "epoch": 2.8325358851674642, "grad_norm": 0.1827647588443468, "learning_rate": 3.1028368794326244e-06, "loss": 0.2877, "step": 1776 }, { "epoch": 2.8341307814992023, "grad_norm": 0.1870850134725146, "learning_rate": 3.0732860520094564e-06, "loss": 0.2798, "step": 1777 }, { "epoch": 2.8357256778309408, "grad_norm": 0.1776653984789135, "learning_rate": 3.0437352245862883e-06, "loss": 0.2684, "step": 1778 }, { "epoch": 2.8373205741626792, "grad_norm": 0.17516231142444352, "learning_rate": 3.0141843971631207e-06, "loss": 0.2828, "step": 1779 }, { "epoch": 2.8389154704944177, "grad_norm": 0.17871525332443872, "learning_rate": 2.984633569739953e-06, "loss": 0.2764, "step": 1780 }, { "epoch": 2.840510366826156, "grad_norm": 0.1903557859603168, "learning_rate": 2.955082742316785e-06, "loss": 0.2878, "step": 1781 }, { "epoch": 2.8421052631578947, "grad_norm": 0.18297464073537237, "learning_rate": 2.9255319148936174e-06, "loss": 0.2837, "step": 1782 }, { "epoch": 2.843700159489633, "grad_norm": 0.184024311965064, "learning_rate": 2.8959810874704493e-06, "loss": 0.2699, "step": 1783 }, { "epoch": 2.8452950558213717, "grad_norm": 0.18598384235049317, "learning_rate": 2.8664302600472812e-06, "loss": 0.2886, "step": 1784 }, { "epoch": 2.84688995215311, "grad_norm": 0.18127734623189712, "learning_rate": 2.8368794326241136e-06, "loss": 0.3046, "step": 1785 }, { "epoch": 2.8484848484848486, "grad_norm": 0.1809864903222878, "learning_rate": 2.8073286052009455e-06, "loss": 0.2881, "step": 1786 }, { "epoch": 2.850079744816587, "grad_norm": 0.1659422039759691, "learning_rate": 2.777777777777778e-06, "loss": 0.2763, "step": 1787 }, { "epoch": 2.8516746411483256, "grad_norm": 0.1749264016846234, "learning_rate": 2.7482269503546103e-06, "loss": 0.2827, "step": 1788 }, { "epoch": 2.853269537480064, "grad_norm": 0.17994686558904968, "learning_rate": 2.7186761229314422e-06, "loss": 0.2926, "step": 1789 }, { "epoch": 2.854864433811802, "grad_norm": 0.1873643963905803, "learning_rate": 2.6891252955082746e-06, "loss": 0.2757, "step": 1790 }, { "epoch": 2.8564593301435406, "grad_norm": 0.1858497739676943, "learning_rate": 2.6595744680851065e-06, "loss": 0.2922, "step": 1791 }, { "epoch": 2.858054226475279, "grad_norm": 0.18018075580738477, "learning_rate": 2.6300236406619385e-06, "loss": 0.2851, "step": 1792 }, { "epoch": 2.8596491228070176, "grad_norm": 0.18058156955236493, "learning_rate": 2.600472813238771e-06, "loss": 0.291, "step": 1793 }, { "epoch": 2.861244019138756, "grad_norm": 0.17466669032325252, "learning_rate": 2.5709219858156028e-06, "loss": 0.2656, "step": 1794 }, { "epoch": 2.8628389154704945, "grad_norm": 0.17913044785786505, "learning_rate": 2.541371158392435e-06, "loss": 0.2749, "step": 1795 }, { "epoch": 2.864433811802233, "grad_norm": 0.1730162953796426, "learning_rate": 2.5118203309692675e-06, "loss": 0.2875, "step": 1796 }, { "epoch": 2.866028708133971, "grad_norm": 0.18158045368351813, "learning_rate": 2.4822695035460995e-06, "loss": 0.2906, "step": 1797 }, { "epoch": 2.8676236044657095, "grad_norm": 0.1862316218899069, "learning_rate": 2.452718676122932e-06, "loss": 0.2775, "step": 1798 }, { "epoch": 2.869218500797448, "grad_norm": 0.17734957465366474, "learning_rate": 2.4231678486997638e-06, "loss": 0.2784, "step": 1799 }, { "epoch": 2.8708133971291865, "grad_norm": 0.18345349851292475, "learning_rate": 2.3936170212765957e-06, "loss": 0.2694, "step": 1800 }, { "epoch": 2.872408293460925, "grad_norm": 0.17955168108412065, "learning_rate": 2.364066193853428e-06, "loss": 0.2759, "step": 1801 }, { "epoch": 2.8740031897926634, "grad_norm": 0.175526259696356, "learning_rate": 2.33451536643026e-06, "loss": 0.2822, "step": 1802 }, { "epoch": 2.875598086124402, "grad_norm": 0.17377263662612072, "learning_rate": 2.304964539007092e-06, "loss": 0.2744, "step": 1803 }, { "epoch": 2.8771929824561404, "grad_norm": 0.19305719965872345, "learning_rate": 2.2754137115839247e-06, "loss": 0.2874, "step": 1804 }, { "epoch": 2.878787878787879, "grad_norm": 0.17647954874125, "learning_rate": 2.2458628841607567e-06, "loss": 0.2768, "step": 1805 }, { "epoch": 2.8803827751196174, "grad_norm": 0.17069273562047857, "learning_rate": 2.2163120567375886e-06, "loss": 0.2731, "step": 1806 }, { "epoch": 2.881977671451356, "grad_norm": 0.18534675578084803, "learning_rate": 2.186761229314421e-06, "loss": 0.3019, "step": 1807 }, { "epoch": 2.8835725677830943, "grad_norm": 0.18636310769663927, "learning_rate": 2.157210401891253e-06, "loss": 0.2659, "step": 1808 }, { "epoch": 2.8851674641148324, "grad_norm": 0.17209062080161636, "learning_rate": 2.1276595744680853e-06, "loss": 0.2748, "step": 1809 }, { "epoch": 2.886762360446571, "grad_norm": 0.1779104694557588, "learning_rate": 2.0981087470449173e-06, "loss": 0.2812, "step": 1810 }, { "epoch": 2.8883572567783093, "grad_norm": 0.18511225899175818, "learning_rate": 2.068557919621749e-06, "loss": 0.3016, "step": 1811 }, { "epoch": 2.889952153110048, "grad_norm": 0.17162331146772472, "learning_rate": 2.039007092198582e-06, "loss": 0.2702, "step": 1812 }, { "epoch": 2.8915470494417863, "grad_norm": 0.18053704692386963, "learning_rate": 2.009456264775414e-06, "loss": 0.2846, "step": 1813 }, { "epoch": 2.893141945773525, "grad_norm": 0.17386130673626354, "learning_rate": 1.979905437352246e-06, "loss": 0.2731, "step": 1814 }, { "epoch": 2.8947368421052633, "grad_norm": 0.17939304415726964, "learning_rate": 1.9503546099290782e-06, "loss": 0.2855, "step": 1815 }, { "epoch": 2.8963317384370018, "grad_norm": 0.18774294274852413, "learning_rate": 1.92080378250591e-06, "loss": 0.2867, "step": 1816 }, { "epoch": 2.89792663476874, "grad_norm": 0.17551414823363473, "learning_rate": 1.8912529550827423e-06, "loss": 0.2819, "step": 1817 }, { "epoch": 2.8995215311004783, "grad_norm": 0.1788783910315461, "learning_rate": 1.8617021276595745e-06, "loss": 0.2764, "step": 1818 }, { "epoch": 2.9011164274322168, "grad_norm": 0.17884220017188104, "learning_rate": 1.8321513002364066e-06, "loss": 0.2755, "step": 1819 }, { "epoch": 2.9027113237639552, "grad_norm": 0.17539751493243738, "learning_rate": 1.802600472813239e-06, "loss": 0.29, "step": 1820 }, { "epoch": 2.9043062200956937, "grad_norm": 0.17869939030304596, "learning_rate": 1.7730496453900712e-06, "loss": 0.2728, "step": 1821 }, { "epoch": 2.905901116427432, "grad_norm": 0.18058472658486602, "learning_rate": 1.7434988179669033e-06, "loss": 0.2917, "step": 1822 }, { "epoch": 2.9074960127591707, "grad_norm": 0.1781107695906984, "learning_rate": 1.7139479905437353e-06, "loss": 0.2902, "step": 1823 }, { "epoch": 2.909090909090909, "grad_norm": 0.19326197085637786, "learning_rate": 1.6843971631205674e-06, "loss": 0.2916, "step": 1824 }, { "epoch": 2.9106858054226477, "grad_norm": 0.1694780268929849, "learning_rate": 1.6548463356973996e-06, "loss": 0.2669, "step": 1825 }, { "epoch": 2.912280701754386, "grad_norm": 0.17343529785013442, "learning_rate": 1.6252955082742317e-06, "loss": 0.2802, "step": 1826 }, { "epoch": 2.9138755980861246, "grad_norm": 0.17287128879665223, "learning_rate": 1.5957446808510639e-06, "loss": 0.2844, "step": 1827 }, { "epoch": 2.915470494417863, "grad_norm": 0.16799397617277284, "learning_rate": 1.5661938534278958e-06, "loss": 0.2764, "step": 1828 }, { "epoch": 2.917065390749601, "grad_norm": 0.1704451337865313, "learning_rate": 1.5366430260047282e-06, "loss": 0.2827, "step": 1829 }, { "epoch": 2.9186602870813396, "grad_norm": 0.17807741752274714, "learning_rate": 1.5070921985815603e-06, "loss": 0.2838, "step": 1830 }, { "epoch": 2.920255183413078, "grad_norm": 0.17565671527908897, "learning_rate": 1.4775413711583925e-06, "loss": 0.2782, "step": 1831 }, { "epoch": 2.9218500797448166, "grad_norm": 0.17527989874214867, "learning_rate": 1.4479905437352246e-06, "loss": 0.2802, "step": 1832 }, { "epoch": 2.923444976076555, "grad_norm": 0.18065686429704436, "learning_rate": 1.4184397163120568e-06, "loss": 0.2784, "step": 1833 }, { "epoch": 2.9250398724082936, "grad_norm": 0.1713271084290113, "learning_rate": 1.388888888888889e-06, "loss": 0.2756, "step": 1834 }, { "epoch": 2.926634768740032, "grad_norm": 0.175257265831353, "learning_rate": 1.3593380614657211e-06, "loss": 0.2756, "step": 1835 }, { "epoch": 2.92822966507177, "grad_norm": 0.18147784603364306, "learning_rate": 1.3297872340425533e-06, "loss": 0.2852, "step": 1836 }, { "epoch": 2.9298245614035086, "grad_norm": 0.17205212526925448, "learning_rate": 1.3002364066193854e-06, "loss": 0.281, "step": 1837 }, { "epoch": 2.931419457735247, "grad_norm": 0.18191625114834736, "learning_rate": 1.2706855791962176e-06, "loss": 0.2832, "step": 1838 }, { "epoch": 2.9330143540669855, "grad_norm": 0.17836153978961786, "learning_rate": 1.2411347517730497e-06, "loss": 0.2892, "step": 1839 }, { "epoch": 2.934609250398724, "grad_norm": 0.1758182644529857, "learning_rate": 1.2115839243498819e-06, "loss": 0.2924, "step": 1840 }, { "epoch": 2.9362041467304625, "grad_norm": 0.17781569791172092, "learning_rate": 1.182033096926714e-06, "loss": 0.2657, "step": 1841 }, { "epoch": 2.937799043062201, "grad_norm": 0.2156312960862922, "learning_rate": 1.152482269503546e-06, "loss": 0.2844, "step": 1842 }, { "epoch": 2.9393939393939394, "grad_norm": 0.17251153597539867, "learning_rate": 1.1229314420803783e-06, "loss": 0.2797, "step": 1843 }, { "epoch": 2.940988835725678, "grad_norm": 0.1765763166984664, "learning_rate": 1.0933806146572105e-06, "loss": 0.2896, "step": 1844 }, { "epoch": 2.9425837320574164, "grad_norm": 0.18604855639724013, "learning_rate": 1.0638297872340427e-06, "loss": 0.2876, "step": 1845 }, { "epoch": 2.944178628389155, "grad_norm": 0.1817616482033588, "learning_rate": 1.0342789598108746e-06, "loss": 0.2805, "step": 1846 }, { "epoch": 2.9457735247208934, "grad_norm": 0.1890994175822004, "learning_rate": 1.004728132387707e-06, "loss": 0.2828, "step": 1847 }, { "epoch": 2.9473684210526314, "grad_norm": 0.1727885831615006, "learning_rate": 9.751773049645391e-07, "loss": 0.2781, "step": 1848 }, { "epoch": 2.94896331738437, "grad_norm": 0.16544774939552032, "learning_rate": 9.456264775413712e-07, "loss": 0.261, "step": 1849 }, { "epoch": 2.9505582137161084, "grad_norm": 0.17013312110722312, "learning_rate": 9.160756501182033e-07, "loss": 0.279, "step": 1850 }, { "epoch": 2.952153110047847, "grad_norm": 0.17486740712093662, "learning_rate": 8.865248226950356e-07, "loss": 0.2863, "step": 1851 }, { "epoch": 2.9537480063795853, "grad_norm": 0.17754191944048953, "learning_rate": 8.569739952718676e-07, "loss": 0.2811, "step": 1852 }, { "epoch": 2.955342902711324, "grad_norm": 0.1787039568724215, "learning_rate": 8.274231678486998e-07, "loss": 0.2925, "step": 1853 }, { "epoch": 2.9569377990430623, "grad_norm": 0.1687984600031586, "learning_rate": 7.978723404255319e-07, "loss": 0.2786, "step": 1854 }, { "epoch": 2.958532695374801, "grad_norm": 0.17794932117668213, "learning_rate": 7.683215130023641e-07, "loss": 0.2739, "step": 1855 }, { "epoch": 2.960127591706539, "grad_norm": 0.17567923462393542, "learning_rate": 7.387706855791962e-07, "loss": 0.2686, "step": 1856 }, { "epoch": 2.9617224880382773, "grad_norm": 0.17619770314762095, "learning_rate": 7.092198581560284e-07, "loss": 0.2905, "step": 1857 }, { "epoch": 2.963317384370016, "grad_norm": 0.1705287370033242, "learning_rate": 6.796690307328606e-07, "loss": 0.2823, "step": 1858 }, { "epoch": 2.9649122807017543, "grad_norm": 0.18524573693219612, "learning_rate": 6.501182033096927e-07, "loss": 0.2916, "step": 1859 }, { "epoch": 2.9665071770334928, "grad_norm": 0.17185126558965574, "learning_rate": 6.205673758865249e-07, "loss": 0.2717, "step": 1860 }, { "epoch": 2.9681020733652312, "grad_norm": 0.17867017629975918, "learning_rate": 5.91016548463357e-07, "loss": 0.2724, "step": 1861 }, { "epoch": 2.9696969696969697, "grad_norm": 0.17387596687741813, "learning_rate": 5.614657210401892e-07, "loss": 0.2826, "step": 1862 }, { "epoch": 2.971291866028708, "grad_norm": 0.17340184441090958, "learning_rate": 5.319148936170213e-07, "loss": 0.2699, "step": 1863 }, { "epoch": 2.9728867623604467, "grad_norm": 0.16804399672003426, "learning_rate": 5.023640661938535e-07, "loss": 0.2713, "step": 1864 }, { "epoch": 2.974481658692185, "grad_norm": 0.16834383768122393, "learning_rate": 4.728132387706856e-07, "loss": 0.2657, "step": 1865 }, { "epoch": 2.9760765550239237, "grad_norm": 0.17193457990019617, "learning_rate": 4.432624113475178e-07, "loss": 0.2757, "step": 1866 }, { "epoch": 2.977671451355662, "grad_norm": 0.17191343304840792, "learning_rate": 4.137115839243499e-07, "loss": 0.2854, "step": 1867 }, { "epoch": 2.9792663476874, "grad_norm": 0.1792734010197656, "learning_rate": 3.8416075650118205e-07, "loss": 0.2893, "step": 1868 }, { "epoch": 2.9808612440191387, "grad_norm": 0.1816250282359354, "learning_rate": 3.546099290780142e-07, "loss": 0.2816, "step": 1869 }, { "epoch": 2.982456140350877, "grad_norm": 0.17428847196847813, "learning_rate": 3.2505910165484635e-07, "loss": 0.2933, "step": 1870 }, { "epoch": 2.9840510366826156, "grad_norm": 0.17606050710425192, "learning_rate": 2.955082742316785e-07, "loss": 0.2777, "step": 1871 }, { "epoch": 2.985645933014354, "grad_norm": 0.16951386218131437, "learning_rate": 2.6595744680851066e-07, "loss": 0.2803, "step": 1872 }, { "epoch": 2.9872408293460926, "grad_norm": 0.1746597320358437, "learning_rate": 2.364066193853428e-07, "loss": 0.2719, "step": 1873 }, { "epoch": 2.988835725677831, "grad_norm": 0.16553050962335797, "learning_rate": 2.0685579196217495e-07, "loss": 0.2782, "step": 1874 }, { "epoch": 2.990430622009569, "grad_norm": 0.16900298289393745, "learning_rate": 1.773049645390071e-07, "loss": 0.2774, "step": 1875 }, { "epoch": 2.9920255183413076, "grad_norm": 0.16472982695332425, "learning_rate": 1.4775413711583925e-07, "loss": 0.2678, "step": 1876 }, { "epoch": 2.993620414673046, "grad_norm": 0.17788673254105541, "learning_rate": 1.182033096926714e-07, "loss": 0.2738, "step": 1877 }, { "epoch": 2.9952153110047846, "grad_norm": 0.17280154399228578, "learning_rate": 8.865248226950355e-08, "loss": 0.2798, "step": 1878 }, { "epoch": 2.996810207336523, "grad_norm": 0.1702191295232523, "learning_rate": 5.91016548463357e-08, "loss": 0.2775, "step": 1879 }, { "epoch": 2.9984051036682615, "grad_norm": 0.17545005071335465, "learning_rate": 2.955082742316785e-08, "loss": 0.2731, "step": 1880 }, { "epoch": 3.0, "grad_norm": 0.1665535899667644, "learning_rate": 0.0, "loss": 0.2713, "step": 1881 }, { "epoch": 3.0, "step": 1881, "total_flos": 2.091981614752989e+19, "train_loss": 0.44952398858635684, "train_runtime": 56118.6705, "train_samples_per_second": 0.536, "train_steps_per_second": 0.034 } ], "logging_steps": 1, "max_steps": 1881, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.091981614752989e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }