random_eMNE67pUJEhYppA3 / trainer_state.json
cutelemonlili's picture
Add files using upload-large-folder tool
34df508 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1190,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016806722689075631,
"grad_norm": 8.21841464435543,
"learning_rate": 9.99998257609161e-06,
"loss": 0.3444,
"step": 1
},
{
"epoch": 0.0033613445378151263,
"grad_norm": 6.5307561999009405,
"learning_rate": 9.999930304487874e-06,
"loss": 0.2694,
"step": 2
},
{
"epoch": 0.005042016806722689,
"grad_norm": 3.8967180599837037,
"learning_rate": 9.999843185553106e-06,
"loss": 0.1601,
"step": 3
},
{
"epoch": 0.0067226890756302525,
"grad_norm": 3.516207675689668,
"learning_rate": 9.999721219894482e-06,
"loss": 0.1494,
"step": 4
},
{
"epoch": 0.008403361344537815,
"grad_norm": 6.088858147617413,
"learning_rate": 9.999564408362054e-06,
"loss": 0.2068,
"step": 5
},
{
"epoch": 0.010084033613445379,
"grad_norm": 4.5663525378609835,
"learning_rate": 9.999372752048729e-06,
"loss": 0.1637,
"step": 6
},
{
"epoch": 0.011764705882352941,
"grad_norm": 4.286095780850501,
"learning_rate": 9.999146252290264e-06,
"loss": 0.1554,
"step": 7
},
{
"epoch": 0.013445378151260505,
"grad_norm": 4.03551304941898,
"learning_rate": 9.998884910665267e-06,
"loss": 0.157,
"step": 8
},
{
"epoch": 0.015126050420168067,
"grad_norm": 5.805511665267424,
"learning_rate": 9.998588728995176e-06,
"loss": 0.2141,
"step": 9
},
{
"epoch": 0.01680672268907563,
"grad_norm": 5.634463236999942,
"learning_rate": 9.998257709344246e-06,
"loss": 0.2417,
"step": 10
},
{
"epoch": 0.018487394957983194,
"grad_norm": 4.639281556442096,
"learning_rate": 9.997891854019538e-06,
"loss": 0.2314,
"step": 11
},
{
"epoch": 0.020168067226890758,
"grad_norm": 4.275446688406603,
"learning_rate": 9.997491165570907e-06,
"loss": 0.183,
"step": 12
},
{
"epoch": 0.021848739495798318,
"grad_norm": 5.1126652407586555,
"learning_rate": 9.997055646790974e-06,
"loss": 0.251,
"step": 13
},
{
"epoch": 0.023529411764705882,
"grad_norm": 5.537773614821847,
"learning_rate": 9.996585300715117e-06,
"loss": 0.2673,
"step": 14
},
{
"epoch": 0.025210084033613446,
"grad_norm": 4.7256726060040455,
"learning_rate": 9.99608013062144e-06,
"loss": 0.1974,
"step": 15
},
{
"epoch": 0.02689075630252101,
"grad_norm": 4.187375261904063,
"learning_rate": 9.995540140030759e-06,
"loss": 0.2111,
"step": 16
},
{
"epoch": 0.02857142857142857,
"grad_norm": 4.0040557938220385,
"learning_rate": 9.994965332706574e-06,
"loss": 0.1859,
"step": 17
},
{
"epoch": 0.030252100840336135,
"grad_norm": 3.3281846391864676,
"learning_rate": 9.99435571265504e-06,
"loss": 0.171,
"step": 18
},
{
"epoch": 0.031932773109243695,
"grad_norm": 4.478925095913666,
"learning_rate": 9.993711284124943e-06,
"loss": 0.1966,
"step": 19
},
{
"epoch": 0.03361344537815126,
"grad_norm": 3.6018656957376756,
"learning_rate": 9.99303205160767e-06,
"loss": 0.2085,
"step": 20
},
{
"epoch": 0.03529411764705882,
"grad_norm": 4.319806671578674,
"learning_rate": 9.992318019837171e-06,
"loss": 0.2047,
"step": 21
},
{
"epoch": 0.03697478991596639,
"grad_norm": 4.620436316163216,
"learning_rate": 9.991569193789938e-06,
"loss": 0.2713,
"step": 22
},
{
"epoch": 0.03865546218487395,
"grad_norm": 4.4035777895289065,
"learning_rate": 9.990785578684963e-06,
"loss": 0.1968,
"step": 23
},
{
"epoch": 0.040336134453781515,
"grad_norm": 4.529546514351305,
"learning_rate": 9.989967179983699e-06,
"loss": 0.2439,
"step": 24
},
{
"epoch": 0.04201680672268908,
"grad_norm": 4.146310383033315,
"learning_rate": 9.989114003390028e-06,
"loss": 0.1803,
"step": 25
},
{
"epoch": 0.043697478991596636,
"grad_norm": 4.256564415881964,
"learning_rate": 9.988226054850218e-06,
"loss": 0.1626,
"step": 26
},
{
"epoch": 0.0453781512605042,
"grad_norm": 4.278060540315039,
"learning_rate": 9.987303340552885e-06,
"loss": 0.2531,
"step": 27
},
{
"epoch": 0.047058823529411764,
"grad_norm": 5.409118538573271,
"learning_rate": 9.98634586692894e-06,
"loss": 0.2607,
"step": 28
},
{
"epoch": 0.04873949579831933,
"grad_norm": 4.529140394915868,
"learning_rate": 9.985353640651563e-06,
"loss": 0.1756,
"step": 29
},
{
"epoch": 0.05042016806722689,
"grad_norm": 5.0629410665648775,
"learning_rate": 9.984326668636131e-06,
"loss": 0.2619,
"step": 30
},
{
"epoch": 0.052100840336134456,
"grad_norm": 5.030852124775789,
"learning_rate": 9.983264958040194e-06,
"loss": 0.2179,
"step": 31
},
{
"epoch": 0.05378151260504202,
"grad_norm": 4.13446898262633,
"learning_rate": 9.98216851626341e-06,
"loss": 0.1483,
"step": 32
},
{
"epoch": 0.05546218487394958,
"grad_norm": 4.718860562037743,
"learning_rate": 9.981037350947503e-06,
"loss": 0.1942,
"step": 33
},
{
"epoch": 0.05714285714285714,
"grad_norm": 5.0987197312854216,
"learning_rate": 9.979871469976197e-06,
"loss": 0.2192,
"step": 34
},
{
"epoch": 0.058823529411764705,
"grad_norm": 4.853234294545507,
"learning_rate": 9.978670881475173e-06,
"loss": 0.2256,
"step": 35
},
{
"epoch": 0.06050420168067227,
"grad_norm": 4.566039439694336,
"learning_rate": 9.977435593812013e-06,
"loss": 0.212,
"step": 36
},
{
"epoch": 0.06218487394957983,
"grad_norm": 4.303938728983624,
"learning_rate": 9.976165615596128e-06,
"loss": 0.1981,
"step": 37
},
{
"epoch": 0.06386554621848739,
"grad_norm": 4.738804474245815,
"learning_rate": 9.974860955678715e-06,
"loss": 0.1928,
"step": 38
},
{
"epoch": 0.06554621848739496,
"grad_norm": 4.74356391080987,
"learning_rate": 9.973521623152682e-06,
"loss": 0.195,
"step": 39
},
{
"epoch": 0.06722689075630252,
"grad_norm": 6.034671221697289,
"learning_rate": 9.972147627352593e-06,
"loss": 0.2954,
"step": 40
},
{
"epoch": 0.06890756302521009,
"grad_norm": 4.6685509141294155,
"learning_rate": 9.970738977854597e-06,
"loss": 0.2195,
"step": 41
},
{
"epoch": 0.07058823529411765,
"grad_norm": 4.564959380430705,
"learning_rate": 9.96929568447637e-06,
"loss": 0.2394,
"step": 42
},
{
"epoch": 0.07226890756302522,
"grad_norm": 3.707111096084297,
"learning_rate": 9.967817757277031e-06,
"loss": 0.1534,
"step": 43
},
{
"epoch": 0.07394957983193277,
"grad_norm": 5.558481032268956,
"learning_rate": 9.966305206557092e-06,
"loss": 0.259,
"step": 44
},
{
"epoch": 0.07563025210084033,
"grad_norm": 4.816786645150557,
"learning_rate": 9.964758042858368e-06,
"loss": 0.3009,
"step": 45
},
{
"epoch": 0.0773109243697479,
"grad_norm": 3.46056254061901,
"learning_rate": 9.963176276963916e-06,
"loss": 0.1647,
"step": 46
},
{
"epoch": 0.07899159663865546,
"grad_norm": 4.324140282243881,
"learning_rate": 9.961559919897954e-06,
"loss": 0.1879,
"step": 47
},
{
"epoch": 0.08067226890756303,
"grad_norm": 4.097033705840372,
"learning_rate": 9.959908982925783e-06,
"loss": 0.2056,
"step": 48
},
{
"epoch": 0.08235294117647059,
"grad_norm": 3.8703298631531573,
"learning_rate": 9.958223477553715e-06,
"loss": 0.2251,
"step": 49
},
{
"epoch": 0.08403361344537816,
"grad_norm": 4.530326484306399,
"learning_rate": 9.956503415528984e-06,
"loss": 0.214,
"step": 50
},
{
"epoch": 0.08571428571428572,
"grad_norm": 3.838645445285574,
"learning_rate": 9.954748808839675e-06,
"loss": 0.1631,
"step": 51
},
{
"epoch": 0.08739495798319327,
"grad_norm": 5.189457107947033,
"learning_rate": 9.952959669714627e-06,
"loss": 0.2349,
"step": 52
},
{
"epoch": 0.08907563025210084,
"grad_norm": 4.478468457730722,
"learning_rate": 9.951136010623359e-06,
"loss": 0.2271,
"step": 53
},
{
"epoch": 0.0907563025210084,
"grad_norm": 3.7459087505453543,
"learning_rate": 9.94927784427598e-06,
"loss": 0.1991,
"step": 54
},
{
"epoch": 0.09243697478991597,
"grad_norm": 4.334863357859166,
"learning_rate": 9.947385183623099e-06,
"loss": 0.1952,
"step": 55
},
{
"epoch": 0.09411764705882353,
"grad_norm": 4.120411980539683,
"learning_rate": 9.945458041855732e-06,
"loss": 0.217,
"step": 56
},
{
"epoch": 0.0957983193277311,
"grad_norm": 3.6830264614513237,
"learning_rate": 9.943496432405213e-06,
"loss": 0.1846,
"step": 57
},
{
"epoch": 0.09747899159663866,
"grad_norm": 3.8118209298483823,
"learning_rate": 9.941500368943111e-06,
"loss": 0.174,
"step": 58
},
{
"epoch": 0.09915966386554621,
"grad_norm": 3.6490768370277,
"learning_rate": 9.939469865381111e-06,
"loss": 0.1764,
"step": 59
},
{
"epoch": 0.10084033613445378,
"grad_norm": 4.594768474393017,
"learning_rate": 9.937404935870938e-06,
"loss": 0.2139,
"step": 60
},
{
"epoch": 0.10252100840336134,
"grad_norm": 3.977673868555674,
"learning_rate": 9.935305594804247e-06,
"loss": 0.1925,
"step": 61
},
{
"epoch": 0.10420168067226891,
"grad_norm": 4.136078170621533,
"learning_rate": 9.933171856812533e-06,
"loss": 0.225,
"step": 62
},
{
"epoch": 0.10588235294117647,
"grad_norm": 4.5341488755616695,
"learning_rate": 9.931003736767013e-06,
"loss": 0.1781,
"step": 63
},
{
"epoch": 0.10756302521008404,
"grad_norm": 4.276594908796245,
"learning_rate": 9.92880124977854e-06,
"loss": 0.2098,
"step": 64
},
{
"epoch": 0.1092436974789916,
"grad_norm": 4.444880797388785,
"learning_rate": 9.926564411197488e-06,
"loss": 0.2456,
"step": 65
},
{
"epoch": 0.11092436974789915,
"grad_norm": 4.0506236653094305,
"learning_rate": 9.924293236613643e-06,
"loss": 0.2102,
"step": 66
},
{
"epoch": 0.11260504201680673,
"grad_norm": 4.545097789043228,
"learning_rate": 9.921987741856099e-06,
"loss": 0.2588,
"step": 67
},
{
"epoch": 0.11428571428571428,
"grad_norm": 3.6500673666849215,
"learning_rate": 9.91964794299315e-06,
"loss": 0.1782,
"step": 68
},
{
"epoch": 0.11596638655462185,
"grad_norm": 4.238844893437317,
"learning_rate": 9.91727385633217e-06,
"loss": 0.1981,
"step": 69
},
{
"epoch": 0.11764705882352941,
"grad_norm": 4.007425712958092,
"learning_rate": 9.91486549841951e-06,
"loss": 0.1857,
"step": 70
},
{
"epoch": 0.11932773109243698,
"grad_norm": 5.073962058395761,
"learning_rate": 9.91242288604037e-06,
"loss": 0.2808,
"step": 71
},
{
"epoch": 0.12100840336134454,
"grad_norm": 3.742305034008983,
"learning_rate": 9.909946036218694e-06,
"loss": 0.2158,
"step": 72
},
{
"epoch": 0.1226890756302521,
"grad_norm": 4.840903526928681,
"learning_rate": 9.907434966217041e-06,
"loss": 0.2658,
"step": 73
},
{
"epoch": 0.12436974789915967,
"grad_norm": 4.515748230482176,
"learning_rate": 9.904889693536475e-06,
"loss": 0.1715,
"step": 74
},
{
"epoch": 0.12605042016806722,
"grad_norm": 4.576112978323743,
"learning_rate": 9.902310235916435e-06,
"loss": 0.2407,
"step": 75
},
{
"epoch": 0.12773109243697478,
"grad_norm": 4.073187031843224,
"learning_rate": 9.899696611334612e-06,
"loss": 0.1885,
"step": 76
},
{
"epoch": 0.12941176470588237,
"grad_norm": 4.837076708284357,
"learning_rate": 9.89704883800683e-06,
"loss": 0.2429,
"step": 77
},
{
"epoch": 0.13109243697478992,
"grad_norm": 4.608602122041852,
"learning_rate": 9.894366934386913e-06,
"loss": 0.1704,
"step": 78
},
{
"epoch": 0.13277310924369748,
"grad_norm": 5.570923387271547,
"learning_rate": 9.891650919166558e-06,
"loss": 0.2605,
"step": 79
},
{
"epoch": 0.13445378151260504,
"grad_norm": 4.528798317968409,
"learning_rate": 9.888900811275205e-06,
"loss": 0.2563,
"step": 80
},
{
"epoch": 0.1361344537815126,
"grad_norm": 4.842711689139884,
"learning_rate": 9.886116629879906e-06,
"loss": 0.2596,
"step": 81
},
{
"epoch": 0.13781512605042018,
"grad_norm": 4.7561000330794645,
"learning_rate": 9.883298394385186e-06,
"loss": 0.1933,
"step": 82
},
{
"epoch": 0.13949579831932774,
"grad_norm": 5.009176440097795,
"learning_rate": 9.880446124432921e-06,
"loss": 0.2083,
"step": 83
},
{
"epoch": 0.1411764705882353,
"grad_norm": 5.331779721460354,
"learning_rate": 9.877559839902185e-06,
"loss": 0.2831,
"step": 84
},
{
"epoch": 0.14285714285714285,
"grad_norm": 4.460326910194911,
"learning_rate": 9.874639560909118e-06,
"loss": 0.2279,
"step": 85
},
{
"epoch": 0.14453781512605043,
"grad_norm": 4.971242931396187,
"learning_rate": 9.871685307806796e-06,
"loss": 0.2144,
"step": 86
},
{
"epoch": 0.146218487394958,
"grad_norm": 4.941549501376583,
"learning_rate": 9.868697101185066e-06,
"loss": 0.238,
"step": 87
},
{
"epoch": 0.14789915966386555,
"grad_norm": 4.837604687905309,
"learning_rate": 9.865674961870428e-06,
"loss": 0.1736,
"step": 88
},
{
"epoch": 0.1495798319327731,
"grad_norm": 4.301169486478077,
"learning_rate": 9.862618910925873e-06,
"loss": 0.1791,
"step": 89
},
{
"epoch": 0.15126050420168066,
"grad_norm": 4.419011396958963,
"learning_rate": 9.859528969650739e-06,
"loss": 0.2424,
"step": 90
},
{
"epoch": 0.15294117647058825,
"grad_norm": 4.439104272537347,
"learning_rate": 9.85640515958057e-06,
"loss": 0.1947,
"step": 91
},
{
"epoch": 0.1546218487394958,
"grad_norm": 4.1606384288910006,
"learning_rate": 9.853247502486957e-06,
"loss": 0.2373,
"step": 92
},
{
"epoch": 0.15630252100840336,
"grad_norm": 4.426995034165966,
"learning_rate": 9.850056020377392e-06,
"loss": 0.1984,
"step": 93
},
{
"epoch": 0.15798319327731092,
"grad_norm": 4.747800742019179,
"learning_rate": 9.846830735495112e-06,
"loss": 0.2304,
"step": 94
},
{
"epoch": 0.15966386554621848,
"grad_norm": 4.325848335124482,
"learning_rate": 9.843571670318943e-06,
"loss": 0.2418,
"step": 95
},
{
"epoch": 0.16134453781512606,
"grad_norm": 5.066068637598227,
"learning_rate": 9.840278847563147e-06,
"loss": 0.3608,
"step": 96
},
{
"epoch": 0.16302521008403362,
"grad_norm": 3.861333513668538,
"learning_rate": 9.836952290177261e-06,
"loss": 0.1868,
"step": 97
},
{
"epoch": 0.16470588235294117,
"grad_norm": 3.9561522011266343,
"learning_rate": 9.833592021345938e-06,
"loss": 0.2093,
"step": 98
},
{
"epoch": 0.16638655462184873,
"grad_norm": 4.132131484116752,
"learning_rate": 9.830198064488783e-06,
"loss": 0.2068,
"step": 99
},
{
"epoch": 0.16806722689075632,
"grad_norm": 3.9580220367981513,
"learning_rate": 9.826770443260193e-06,
"loss": 0.1995,
"step": 100
},
{
"epoch": 0.16974789915966387,
"grad_norm": 4.264099752452373,
"learning_rate": 9.823309181549194e-06,
"loss": 0.2047,
"step": 101
},
{
"epoch": 0.17142857142857143,
"grad_norm": 4.954717584301421,
"learning_rate": 9.819814303479268e-06,
"loss": 0.2233,
"step": 102
},
{
"epoch": 0.173109243697479,
"grad_norm": 4.616243415887144,
"learning_rate": 9.816285833408185e-06,
"loss": 0.2475,
"step": 103
},
{
"epoch": 0.17478991596638654,
"grad_norm": 4.489313598265477,
"learning_rate": 9.812723795927848e-06,
"loss": 0.2042,
"step": 104
},
{
"epoch": 0.17647058823529413,
"grad_norm": 4.891274285476457,
"learning_rate": 9.809128215864096e-06,
"loss": 0.2188,
"step": 105
},
{
"epoch": 0.1781512605042017,
"grad_norm": 4.479381566640569,
"learning_rate": 9.805499118276555e-06,
"loss": 0.2009,
"step": 106
},
{
"epoch": 0.17983193277310924,
"grad_norm": 4.492071651356979,
"learning_rate": 9.801836528458453e-06,
"loss": 0.2032,
"step": 107
},
{
"epoch": 0.1815126050420168,
"grad_norm": 3.9572668817717425,
"learning_rate": 9.798140471936437e-06,
"loss": 0.1341,
"step": 108
},
{
"epoch": 0.18319327731092436,
"grad_norm": 5.227775544340066,
"learning_rate": 9.79441097447041e-06,
"loss": 0.2361,
"step": 109
},
{
"epoch": 0.18487394957983194,
"grad_norm": 4.480776893524611,
"learning_rate": 9.790648062053341e-06,
"loss": 0.223,
"step": 110
},
{
"epoch": 0.1865546218487395,
"grad_norm": 3.9849522745760977,
"learning_rate": 9.786851760911084e-06,
"loss": 0.1797,
"step": 111
},
{
"epoch": 0.18823529411764706,
"grad_norm": 3.765063799769012,
"learning_rate": 9.783022097502204e-06,
"loss": 0.2076,
"step": 112
},
{
"epoch": 0.1899159663865546,
"grad_norm": 5.652920919469918,
"learning_rate": 9.779159098517781e-06,
"loss": 0.298,
"step": 113
},
{
"epoch": 0.1915966386554622,
"grad_norm": 4.4105882329875135,
"learning_rate": 9.77526279088123e-06,
"loss": 0.2307,
"step": 114
},
{
"epoch": 0.19327731092436976,
"grad_norm": 5.721726388390177,
"learning_rate": 9.771333201748116e-06,
"loss": 0.2443,
"step": 115
},
{
"epoch": 0.1949579831932773,
"grad_norm": 3.9054517446447123,
"learning_rate": 9.767370358505958e-06,
"loss": 0.1683,
"step": 116
},
{
"epoch": 0.19663865546218487,
"grad_norm": 5.2393244986377,
"learning_rate": 9.763374288774043e-06,
"loss": 0.2234,
"step": 117
},
{
"epoch": 0.19831932773109243,
"grad_norm": 4.116682183617701,
"learning_rate": 9.759345020403233e-06,
"loss": 0.2216,
"step": 118
},
{
"epoch": 0.2,
"grad_norm": 4.2674607287039406,
"learning_rate": 9.755282581475769e-06,
"loss": 0.1931,
"step": 119
},
{
"epoch": 0.20168067226890757,
"grad_norm": 3.6201843953883284,
"learning_rate": 9.751187000305076e-06,
"loss": 0.186,
"step": 120
},
{
"epoch": 0.20336134453781513,
"grad_norm": 4.970181582570933,
"learning_rate": 9.747058305435566e-06,
"loss": 0.2231,
"step": 121
},
{
"epoch": 0.20504201680672268,
"grad_norm": 4.6456388163364295,
"learning_rate": 9.742896525642442e-06,
"loss": 0.2568,
"step": 122
},
{
"epoch": 0.20672268907563024,
"grad_norm": 4.972251282923708,
"learning_rate": 9.738701689931488e-06,
"loss": 0.2637,
"step": 123
},
{
"epoch": 0.20840336134453782,
"grad_norm": 4.423134442557392,
"learning_rate": 9.734473827538881e-06,
"loss": 0.2398,
"step": 124
},
{
"epoch": 0.21008403361344538,
"grad_norm": 4.560715987612911,
"learning_rate": 9.730212967930974e-06,
"loss": 0.2274,
"step": 125
},
{
"epoch": 0.21176470588235294,
"grad_norm": 3.5433880225975254,
"learning_rate": 9.7259191408041e-06,
"loss": 0.173,
"step": 126
},
{
"epoch": 0.2134453781512605,
"grad_norm": 5.148310128666882,
"learning_rate": 9.721592376084355e-06,
"loss": 0.2769,
"step": 127
},
{
"epoch": 0.21512605042016808,
"grad_norm": 4.795984062191726,
"learning_rate": 9.717232703927402e-06,
"loss": 0.2244,
"step": 128
},
{
"epoch": 0.21680672268907564,
"grad_norm": 5.700619371572016,
"learning_rate": 9.712840154718253e-06,
"loss": 0.2338,
"step": 129
},
{
"epoch": 0.2184873949579832,
"grad_norm": 4.669225751974187,
"learning_rate": 9.70841475907106e-06,
"loss": 0.1966,
"step": 130
},
{
"epoch": 0.22016806722689075,
"grad_norm": 3.6626918920905878,
"learning_rate": 9.703956547828893e-06,
"loss": 0.1954,
"step": 131
},
{
"epoch": 0.2218487394957983,
"grad_norm": 5.236075056300641,
"learning_rate": 9.69946555206354e-06,
"loss": 0.1988,
"step": 132
},
{
"epoch": 0.2235294117647059,
"grad_norm": 4.69030459950783,
"learning_rate": 9.694941803075285e-06,
"loss": 0.2693,
"step": 133
},
{
"epoch": 0.22521008403361345,
"grad_norm": 3.955969407559595,
"learning_rate": 9.690385332392676e-06,
"loss": 0.2082,
"step": 134
},
{
"epoch": 0.226890756302521,
"grad_norm": 4.177067871929489,
"learning_rate": 9.685796171772327e-06,
"loss": 0.2962,
"step": 135
},
{
"epoch": 0.22857142857142856,
"grad_norm": 4.167190069041118,
"learning_rate": 9.681174353198687e-06,
"loss": 0.2331,
"step": 136
},
{
"epoch": 0.23025210084033612,
"grad_norm": 4.225274226603427,
"learning_rate": 9.67651990888381e-06,
"loss": 0.1906,
"step": 137
},
{
"epoch": 0.2319327731092437,
"grad_norm": 4.197817430164727,
"learning_rate": 9.67183287126714e-06,
"loss": 0.2023,
"step": 138
},
{
"epoch": 0.23361344537815126,
"grad_norm": 3.935578613508391,
"learning_rate": 9.667113273015283e-06,
"loss": 0.2062,
"step": 139
},
{
"epoch": 0.23529411764705882,
"grad_norm": 4.605647036736872,
"learning_rate": 9.66236114702178e-06,
"loss": 0.2104,
"step": 140
},
{
"epoch": 0.23697478991596638,
"grad_norm": 4.341408968768613,
"learning_rate": 9.657576526406872e-06,
"loss": 0.1792,
"step": 141
},
{
"epoch": 0.23865546218487396,
"grad_norm": 4.754632418091511,
"learning_rate": 9.652759444517276e-06,
"loss": 0.212,
"step": 142
},
{
"epoch": 0.24033613445378152,
"grad_norm": 4.013047554379441,
"learning_rate": 9.647909934925952e-06,
"loss": 0.2134,
"step": 143
},
{
"epoch": 0.24201680672268908,
"grad_norm": 4.3593189912939945,
"learning_rate": 9.64302803143186e-06,
"loss": 0.184,
"step": 144
},
{
"epoch": 0.24369747899159663,
"grad_norm": 4.378110003430036,
"learning_rate": 9.63811376805974e-06,
"loss": 0.2012,
"step": 145
},
{
"epoch": 0.2453781512605042,
"grad_norm": 4.828592016527972,
"learning_rate": 9.633167179059859e-06,
"loss": 0.2797,
"step": 146
},
{
"epoch": 0.24705882352941178,
"grad_norm": 3.891371257775755,
"learning_rate": 9.628188298907782e-06,
"loss": 0.1788,
"step": 147
},
{
"epoch": 0.24873949579831933,
"grad_norm": 5.6001909540395065,
"learning_rate": 9.623177162304132e-06,
"loss": 0.2722,
"step": 148
},
{
"epoch": 0.2504201680672269,
"grad_norm": 4.193146001915105,
"learning_rate": 9.618133804174341e-06,
"loss": 0.2035,
"step": 149
},
{
"epoch": 0.25210084033613445,
"grad_norm": 4.325524865328703,
"learning_rate": 9.613058259668416e-06,
"loss": 0.2327,
"step": 150
},
{
"epoch": 0.253781512605042,
"grad_norm": 5.602899385924722,
"learning_rate": 9.607950564160682e-06,
"loss": 0.2548,
"step": 151
},
{
"epoch": 0.25546218487394956,
"grad_norm": 4.129341895449091,
"learning_rate": 9.602810753249549e-06,
"loss": 0.1687,
"step": 152
},
{
"epoch": 0.2571428571428571,
"grad_norm": 4.056125421715808,
"learning_rate": 9.597638862757255e-06,
"loss": 0.1439,
"step": 153
},
{
"epoch": 0.25882352941176473,
"grad_norm": 4.625817847491738,
"learning_rate": 9.592434928729617e-06,
"loss": 0.2313,
"step": 154
},
{
"epoch": 0.2605042016806723,
"grad_norm": 5.159432944923756,
"learning_rate": 9.587198987435782e-06,
"loss": 0.2314,
"step": 155
},
{
"epoch": 0.26218487394957984,
"grad_norm": 4.821801696820484,
"learning_rate": 9.581931075367979e-06,
"loss": 0.2598,
"step": 156
},
{
"epoch": 0.2638655462184874,
"grad_norm": 4.864810002494065,
"learning_rate": 9.576631229241248e-06,
"loss": 0.2023,
"step": 157
},
{
"epoch": 0.26554621848739496,
"grad_norm": 4.706045710445991,
"learning_rate": 9.57129948599321e-06,
"loss": 0.2339,
"step": 158
},
{
"epoch": 0.2672268907563025,
"grad_norm": 4.712230200754653,
"learning_rate": 9.565935882783784e-06,
"loss": 0.2565,
"step": 159
},
{
"epoch": 0.2689075630252101,
"grad_norm": 5.822625035219728,
"learning_rate": 9.56054045699494e-06,
"loss": 0.2678,
"step": 160
},
{
"epoch": 0.27058823529411763,
"grad_norm": 4.050112712888665,
"learning_rate": 9.555113246230443e-06,
"loss": 0.2299,
"step": 161
},
{
"epoch": 0.2722689075630252,
"grad_norm": 4.070314071680027,
"learning_rate": 9.54965428831558e-06,
"loss": 0.2794,
"step": 162
},
{
"epoch": 0.2739495798319328,
"grad_norm": 4.036666909908974,
"learning_rate": 9.544163621296906e-06,
"loss": 0.2553,
"step": 163
},
{
"epoch": 0.27563025210084036,
"grad_norm": 4.617935809136051,
"learning_rate": 9.538641283441974e-06,
"loss": 0.2972,
"step": 164
},
{
"epoch": 0.2773109243697479,
"grad_norm": 3.5010006509743867,
"learning_rate": 9.533087313239065e-06,
"loss": 0.181,
"step": 165
},
{
"epoch": 0.27899159663865547,
"grad_norm": 4.62451295112266,
"learning_rate": 9.527501749396924e-06,
"loss": 0.2186,
"step": 166
},
{
"epoch": 0.280672268907563,
"grad_norm": 4.058513129530688,
"learning_rate": 9.521884630844498e-06,
"loss": 0.24,
"step": 167
},
{
"epoch": 0.2823529411764706,
"grad_norm": 4.1500543346518155,
"learning_rate": 9.516235996730645e-06,
"loss": 0.2403,
"step": 168
},
{
"epoch": 0.28403361344537814,
"grad_norm": 3.7555979901592376,
"learning_rate": 9.510555886423883e-06,
"loss": 0.154,
"step": 169
},
{
"epoch": 0.2857142857142857,
"grad_norm": 4.78167958275682,
"learning_rate": 9.504844339512096e-06,
"loss": 0.2789,
"step": 170
},
{
"epoch": 0.28739495798319326,
"grad_norm": 5.370146109187836,
"learning_rate": 9.499101395802277e-06,
"loss": 0.2698,
"step": 171
},
{
"epoch": 0.28907563025210087,
"grad_norm": 5.169364964187269,
"learning_rate": 9.493327095320231e-06,
"loss": 0.2108,
"step": 172
},
{
"epoch": 0.2907563025210084,
"grad_norm": 4.469107375219695,
"learning_rate": 9.487521478310316e-06,
"loss": 0.2703,
"step": 173
},
{
"epoch": 0.292436974789916,
"grad_norm": 4.21829425061834,
"learning_rate": 9.481684585235145e-06,
"loss": 0.1538,
"step": 174
},
{
"epoch": 0.29411764705882354,
"grad_norm": 4.194377947803058,
"learning_rate": 9.475816456775313e-06,
"loss": 0.2168,
"step": 175
},
{
"epoch": 0.2957983193277311,
"grad_norm": 4.513615580315599,
"learning_rate": 9.469917133829114e-06,
"loss": 0.2384,
"step": 176
},
{
"epoch": 0.29747899159663865,
"grad_norm": 3.475010035423131,
"learning_rate": 9.463986657512254e-06,
"loss": 0.1856,
"step": 177
},
{
"epoch": 0.2991596638655462,
"grad_norm": 3.9726843146403574,
"learning_rate": 9.458025069157563e-06,
"loss": 0.2072,
"step": 178
},
{
"epoch": 0.30084033613445377,
"grad_norm": 3.13300922477254,
"learning_rate": 9.452032410314709e-06,
"loss": 0.1727,
"step": 179
},
{
"epoch": 0.3025210084033613,
"grad_norm": 3.723460543351599,
"learning_rate": 9.446008722749906e-06,
"loss": 0.1676,
"step": 180
},
{
"epoch": 0.3042016806722689,
"grad_norm": 4.528548649597003,
"learning_rate": 9.439954048445628e-06,
"loss": 0.2251,
"step": 181
},
{
"epoch": 0.3058823529411765,
"grad_norm": 3.557373366966036,
"learning_rate": 9.43386842960031e-06,
"loss": 0.206,
"step": 182
},
{
"epoch": 0.30756302521008405,
"grad_norm": 3.42214545293451,
"learning_rate": 9.427751908628059e-06,
"loss": 0.1895,
"step": 183
},
{
"epoch": 0.3092436974789916,
"grad_norm": 4.515404571418755,
"learning_rate": 9.421604528158355e-06,
"loss": 0.2641,
"step": 184
},
{
"epoch": 0.31092436974789917,
"grad_norm": 4.926637503494079,
"learning_rate": 9.415426331035754e-06,
"loss": 0.2524,
"step": 185
},
{
"epoch": 0.3126050420168067,
"grad_norm": 4.383497390044887,
"learning_rate": 9.409217360319594e-06,
"loss": 0.238,
"step": 186
},
{
"epoch": 0.3142857142857143,
"grad_norm": 4.234792938224042,
"learning_rate": 9.40297765928369e-06,
"loss": 0.2292,
"step": 187
},
{
"epoch": 0.31596638655462184,
"grad_norm": 4.481631640061522,
"learning_rate": 9.396707271416035e-06,
"loss": 0.3017,
"step": 188
},
{
"epoch": 0.3176470588235294,
"grad_norm": 5.256316840677135,
"learning_rate": 9.39040624041849e-06,
"loss": 0.2403,
"step": 189
},
{
"epoch": 0.31932773109243695,
"grad_norm": 3.781091993143701,
"learning_rate": 9.384074610206495e-06,
"loss": 0.1869,
"step": 190
},
{
"epoch": 0.32100840336134456,
"grad_norm": 4.6057017882339615,
"learning_rate": 9.377712424908743e-06,
"loss": 0.2507,
"step": 191
},
{
"epoch": 0.3226890756302521,
"grad_norm": 6.461861467319499,
"learning_rate": 9.371319728866892e-06,
"loss": 0.281,
"step": 192
},
{
"epoch": 0.3243697478991597,
"grad_norm": 4.800015707877198,
"learning_rate": 9.36489656663524e-06,
"loss": 0.2458,
"step": 193
},
{
"epoch": 0.32605042016806723,
"grad_norm": 4.227032338998176,
"learning_rate": 9.35844298298042e-06,
"loss": 0.2197,
"step": 194
},
{
"epoch": 0.3277310924369748,
"grad_norm": 4.816215636974397,
"learning_rate": 9.351959022881098e-06,
"loss": 0.2489,
"step": 195
},
{
"epoch": 0.32941176470588235,
"grad_norm": 4.886220251164849,
"learning_rate": 9.345444731527642e-06,
"loss": 0.1837,
"step": 196
},
{
"epoch": 0.3310924369747899,
"grad_norm": 5.191082310263523,
"learning_rate": 9.338900154321818e-06,
"loss": 0.206,
"step": 197
},
{
"epoch": 0.33277310924369746,
"grad_norm": 3.617055755969013,
"learning_rate": 9.332325336876472e-06,
"loss": 0.1452,
"step": 198
},
{
"epoch": 0.334453781512605,
"grad_norm": 4.226640548699603,
"learning_rate": 9.325720325015211e-06,
"loss": 0.224,
"step": 199
},
{
"epoch": 0.33613445378151263,
"grad_norm": 4.513807147293619,
"learning_rate": 9.319085164772082e-06,
"loss": 0.2354,
"step": 200
},
{
"epoch": 0.3378151260504202,
"grad_norm": 4.077629119418003,
"learning_rate": 9.312419902391256e-06,
"loss": 0.2192,
"step": 201
},
{
"epoch": 0.33949579831932775,
"grad_norm": 4.470110460616241,
"learning_rate": 9.305724584326702e-06,
"loss": 0.182,
"step": 202
},
{
"epoch": 0.3411764705882353,
"grad_norm": 4.680846295566748,
"learning_rate": 9.298999257241862e-06,
"loss": 0.2393,
"step": 203
},
{
"epoch": 0.34285714285714286,
"grad_norm": 4.526839194923339,
"learning_rate": 9.292243968009332e-06,
"loss": 0.1923,
"step": 204
},
{
"epoch": 0.3445378151260504,
"grad_norm": 4.538887920332639,
"learning_rate": 9.285458763710524e-06,
"loss": 0.2485,
"step": 205
},
{
"epoch": 0.346218487394958,
"grad_norm": 5.593301905843468,
"learning_rate": 9.278643691635352e-06,
"loss": 0.2729,
"step": 206
},
{
"epoch": 0.34789915966386553,
"grad_norm": 4.928678837563507,
"learning_rate": 9.271798799281893e-06,
"loss": 0.2701,
"step": 207
},
{
"epoch": 0.3495798319327731,
"grad_norm": 4.211161086011563,
"learning_rate": 9.264924134356057e-06,
"loss": 0.2218,
"step": 208
},
{
"epoch": 0.35126050420168065,
"grad_norm": 4.619486458421115,
"learning_rate": 9.258019744771256e-06,
"loss": 0.2004,
"step": 209
},
{
"epoch": 0.35294117647058826,
"grad_norm": 3.602112878149427,
"learning_rate": 9.251085678648072e-06,
"loss": 0.2173,
"step": 210
},
{
"epoch": 0.3546218487394958,
"grad_norm": 3.7653597093168556,
"learning_rate": 9.244121984313916e-06,
"loss": 0.1888,
"step": 211
},
{
"epoch": 0.3563025210084034,
"grad_norm": 5.432992211487408,
"learning_rate": 9.2371287103027e-06,
"loss": 0.274,
"step": 212
},
{
"epoch": 0.35798319327731093,
"grad_norm": 4.246141250922947,
"learning_rate": 9.23010590535449e-06,
"loss": 0.2673,
"step": 213
},
{
"epoch": 0.3596638655462185,
"grad_norm": 5.33566750421804,
"learning_rate": 9.223053618415168e-06,
"loss": 0.2074,
"step": 214
},
{
"epoch": 0.36134453781512604,
"grad_norm": 4.283572988780274,
"learning_rate": 9.215971898636094e-06,
"loss": 0.2254,
"step": 215
},
{
"epoch": 0.3630252100840336,
"grad_norm": 4.561348305944251,
"learning_rate": 9.208860795373765e-06,
"loss": 0.2254,
"step": 216
},
{
"epoch": 0.36470588235294116,
"grad_norm": 4.9036278330461665,
"learning_rate": 9.201720358189464e-06,
"loss": 0.2071,
"step": 217
},
{
"epoch": 0.3663865546218487,
"grad_norm": 4.635203208682745,
"learning_rate": 9.194550636848923e-06,
"loss": 0.2038,
"step": 218
},
{
"epoch": 0.3680672268907563,
"grad_norm": 4.19385346288178,
"learning_rate": 9.187351681321965e-06,
"loss": 0.2382,
"step": 219
},
{
"epoch": 0.3697478991596639,
"grad_norm": 3.8373061734988623,
"learning_rate": 9.180123541782172e-06,
"loss": 0.1818,
"step": 220
},
{
"epoch": 0.37142857142857144,
"grad_norm": 3.9983995742701466,
"learning_rate": 9.172866268606514e-06,
"loss": 0.2239,
"step": 221
},
{
"epoch": 0.373109243697479,
"grad_norm": 4.096523467317054,
"learning_rate": 9.16557991237502e-06,
"loss": 0.1967,
"step": 222
},
{
"epoch": 0.37478991596638656,
"grad_norm": 3.9949804148348953,
"learning_rate": 9.158264523870413e-06,
"loss": 0.2539,
"step": 223
},
{
"epoch": 0.3764705882352941,
"grad_norm": 3.4121379024957426,
"learning_rate": 9.150920154077753e-06,
"loss": 0.1659,
"step": 224
},
{
"epoch": 0.37815126050420167,
"grad_norm": 4.223999169456629,
"learning_rate": 9.143546854184095e-06,
"loss": 0.1866,
"step": 225
},
{
"epoch": 0.3798319327731092,
"grad_norm": 4.4462570365988885,
"learning_rate": 9.136144675578114e-06,
"loss": 0.2234,
"step": 226
},
{
"epoch": 0.3815126050420168,
"grad_norm": 5.937282515027506,
"learning_rate": 9.128713669849767e-06,
"loss": 0.2484,
"step": 227
},
{
"epoch": 0.3831932773109244,
"grad_norm": 3.970889487846386,
"learning_rate": 9.121253888789916e-06,
"loss": 0.184,
"step": 228
},
{
"epoch": 0.38487394957983195,
"grad_norm": 3.4532808484924638,
"learning_rate": 9.113765384389984e-06,
"loss": 0.1931,
"step": 229
},
{
"epoch": 0.3865546218487395,
"grad_norm": 3.296349607830855,
"learning_rate": 9.106248208841568e-06,
"loss": 0.1928,
"step": 230
},
{
"epoch": 0.38823529411764707,
"grad_norm": 3.5827334306096548,
"learning_rate": 9.098702414536107e-06,
"loss": 0.1947,
"step": 231
},
{
"epoch": 0.3899159663865546,
"grad_norm": 4.419565381593234,
"learning_rate": 9.091128054064487e-06,
"loss": 0.1988,
"step": 232
},
{
"epoch": 0.3915966386554622,
"grad_norm": 4.629227753836929,
"learning_rate": 9.083525180216697e-06,
"loss": 0.2783,
"step": 233
},
{
"epoch": 0.39327731092436974,
"grad_norm": 4.780093359100441,
"learning_rate": 9.075893845981445e-06,
"loss": 0.1882,
"step": 234
},
{
"epoch": 0.3949579831932773,
"grad_norm": 4.028034546146023,
"learning_rate": 9.0682341045458e-06,
"loss": 0.2319,
"step": 235
},
{
"epoch": 0.39663865546218485,
"grad_norm": 3.8750364011106884,
"learning_rate": 9.060546009294818e-06,
"loss": 0.2066,
"step": 236
},
{
"epoch": 0.3983193277310924,
"grad_norm": 4.158470486371845,
"learning_rate": 9.05282961381116e-06,
"loss": 0.2507,
"step": 237
},
{
"epoch": 0.4,
"grad_norm": 4.18280526847092,
"learning_rate": 9.045084971874738e-06,
"loss": 0.2264,
"step": 238
},
{
"epoch": 0.4016806722689076,
"grad_norm": 3.899581582099338,
"learning_rate": 9.037312137462323e-06,
"loss": 0.1807,
"step": 239
},
{
"epoch": 0.40336134453781514,
"grad_norm": 3.790705042738854,
"learning_rate": 9.029511164747175e-06,
"loss": 0.1991,
"step": 240
},
{
"epoch": 0.4050420168067227,
"grad_norm": 3.9082019683768157,
"learning_rate": 9.021682108098671e-06,
"loss": 0.195,
"step": 241
},
{
"epoch": 0.40672268907563025,
"grad_norm": 4.988301662674425,
"learning_rate": 9.013825022081915e-06,
"loss": 0.2016,
"step": 242
},
{
"epoch": 0.4084033613445378,
"grad_norm": 4.5339550958204855,
"learning_rate": 9.005939961457366e-06,
"loss": 0.2199,
"step": 243
},
{
"epoch": 0.41008403361344536,
"grad_norm": 3.8323113686251644,
"learning_rate": 8.998026981180454e-06,
"loss": 0.19,
"step": 244
},
{
"epoch": 0.4117647058823529,
"grad_norm": 4.253369786543169,
"learning_rate": 8.990086136401199e-06,
"loss": 0.2044,
"step": 245
},
{
"epoch": 0.4134453781512605,
"grad_norm": 3.9026309891957736,
"learning_rate": 8.982117482463817e-06,
"loss": 0.2221,
"step": 246
},
{
"epoch": 0.4151260504201681,
"grad_norm": 3.9695322564079674,
"learning_rate": 8.97412107490635e-06,
"loss": 0.22,
"step": 247
},
{
"epoch": 0.41680672268907565,
"grad_norm": 4.967876667155862,
"learning_rate": 8.966096969460263e-06,
"loss": 0.2525,
"step": 248
},
{
"epoch": 0.4184873949579832,
"grad_norm": 4.060825678979327,
"learning_rate": 8.958045222050073e-06,
"loss": 0.1935,
"step": 249
},
{
"epoch": 0.42016806722689076,
"grad_norm": 3.973788704821746,
"learning_rate": 8.94996588879294e-06,
"loss": 0.2171,
"step": 250
},
{
"epoch": 0.4218487394957983,
"grad_norm": 4.090322297929094,
"learning_rate": 8.94185902599829e-06,
"loss": 0.2173,
"step": 251
},
{
"epoch": 0.4235294117647059,
"grad_norm": 4.503759124581851,
"learning_rate": 8.933724690167417e-06,
"loss": 0.2442,
"step": 252
},
{
"epoch": 0.42521008403361343,
"grad_norm": 4.46443714035923,
"learning_rate": 8.92556293799309e-06,
"loss": 0.2254,
"step": 253
},
{
"epoch": 0.426890756302521,
"grad_norm": 4.801328310938415,
"learning_rate": 8.917373826359156e-06,
"loss": 0.2286,
"step": 254
},
{
"epoch": 0.42857142857142855,
"grad_norm": 4.106180470850406,
"learning_rate": 8.90915741234015e-06,
"loss": 0.2321,
"step": 255
},
{
"epoch": 0.43025210084033616,
"grad_norm": 4.105749099535154,
"learning_rate": 8.900913753200887e-06,
"loss": 0.2081,
"step": 256
},
{
"epoch": 0.4319327731092437,
"grad_norm": 4.043544008475371,
"learning_rate": 8.892642906396076e-06,
"loss": 0.2189,
"step": 257
},
{
"epoch": 0.4336134453781513,
"grad_norm": 4.095210376362221,
"learning_rate": 8.884344929569905e-06,
"loss": 0.2413,
"step": 258
},
{
"epoch": 0.43529411764705883,
"grad_norm": 3.341752408472611,
"learning_rate": 8.87601988055565e-06,
"loss": 0.1966,
"step": 259
},
{
"epoch": 0.4369747899159664,
"grad_norm": 4.364023975969356,
"learning_rate": 8.867667817375266e-06,
"loss": 0.2762,
"step": 260
},
{
"epoch": 0.43865546218487395,
"grad_norm": 5.054813922266153,
"learning_rate": 8.859288798238988e-06,
"loss": 0.2721,
"step": 261
},
{
"epoch": 0.4403361344537815,
"grad_norm": 3.2448771175239903,
"learning_rate": 8.850882881544923e-06,
"loss": 0.1857,
"step": 262
},
{
"epoch": 0.44201680672268906,
"grad_norm": 3.5338228285569224,
"learning_rate": 8.842450125878634e-06,
"loss": 0.1913,
"step": 263
},
{
"epoch": 0.4436974789915966,
"grad_norm": 4.138571886719138,
"learning_rate": 8.833990590012749e-06,
"loss": 0.2272,
"step": 264
},
{
"epoch": 0.44537815126050423,
"grad_norm": 4.40959325327376,
"learning_rate": 8.825504332906542e-06,
"loss": 0.2235,
"step": 265
},
{
"epoch": 0.4470588235294118,
"grad_norm": 5.463965171515439,
"learning_rate": 8.816991413705515e-06,
"loss": 0.2681,
"step": 266
},
{
"epoch": 0.44873949579831934,
"grad_norm": 4.258513743820523,
"learning_rate": 8.808451891741001e-06,
"loss": 0.1973,
"step": 267
},
{
"epoch": 0.4504201680672269,
"grad_norm": 5.106159542200387,
"learning_rate": 8.799885826529736e-06,
"loss": 0.2918,
"step": 268
},
{
"epoch": 0.45210084033613446,
"grad_norm": 3.4701298119834476,
"learning_rate": 8.79129327777346e-06,
"loss": 0.2155,
"step": 269
},
{
"epoch": 0.453781512605042,
"grad_norm": 3.7529119643338014,
"learning_rate": 8.782674305358481e-06,
"loss": 0.2887,
"step": 270
},
{
"epoch": 0.45546218487394957,
"grad_norm": 5.682941483106064,
"learning_rate": 8.774028969355273e-06,
"loss": 0.2483,
"step": 271
},
{
"epoch": 0.45714285714285713,
"grad_norm": 4.058019802098144,
"learning_rate": 8.765357330018056e-06,
"loss": 0.2492,
"step": 272
},
{
"epoch": 0.4588235294117647,
"grad_norm": 4.641783192950925,
"learning_rate": 8.756659447784367e-06,
"loss": 0.235,
"step": 273
},
{
"epoch": 0.46050420168067224,
"grad_norm": 4.648930577610503,
"learning_rate": 8.74793538327465e-06,
"loss": 0.2723,
"step": 274
},
{
"epoch": 0.46218487394957986,
"grad_norm": 3.6265958946164676,
"learning_rate": 8.739185197291824e-06,
"loss": 0.1753,
"step": 275
},
{
"epoch": 0.4638655462184874,
"grad_norm": 4.151010075520344,
"learning_rate": 8.730408950820864e-06,
"loss": 0.2342,
"step": 276
},
{
"epoch": 0.46554621848739497,
"grad_norm": 3.9544139566060075,
"learning_rate": 8.721606705028376e-06,
"loss": 0.2369,
"step": 277
},
{
"epoch": 0.4672268907563025,
"grad_norm": 4.865583546199144,
"learning_rate": 8.71277852126217e-06,
"loss": 0.2174,
"step": 278
},
{
"epoch": 0.4689075630252101,
"grad_norm": 3.7227278170113167,
"learning_rate": 8.703924461050832e-06,
"loss": 0.2652,
"step": 279
},
{
"epoch": 0.47058823529411764,
"grad_norm": 3.9404090239139413,
"learning_rate": 8.695044586103297e-06,
"loss": 0.1988,
"step": 280
},
{
"epoch": 0.4722689075630252,
"grad_norm": 3.8358691946565355,
"learning_rate": 8.686138958308415e-06,
"loss": 0.2018,
"step": 281
},
{
"epoch": 0.47394957983193275,
"grad_norm": 3.311037810132601,
"learning_rate": 8.67720763973452e-06,
"loss": 0.195,
"step": 282
},
{
"epoch": 0.4756302521008403,
"grad_norm": 3.5994550095049656,
"learning_rate": 8.668250692629008e-06,
"loss": 0.2216,
"step": 283
},
{
"epoch": 0.4773109243697479,
"grad_norm": 3.9949982493621166,
"learning_rate": 8.659268179417886e-06,
"loss": 0.2447,
"step": 284
},
{
"epoch": 0.4789915966386555,
"grad_norm": 5.049172465722346,
"learning_rate": 8.65026016270535e-06,
"loss": 0.2252,
"step": 285
},
{
"epoch": 0.48067226890756304,
"grad_norm": 3.7246219501155675,
"learning_rate": 8.641226705273344e-06,
"loss": 0.1994,
"step": 286
},
{
"epoch": 0.4823529411764706,
"grad_norm": 5.424735374280226,
"learning_rate": 8.632167870081122e-06,
"loss": 0.2268,
"step": 287
},
{
"epoch": 0.48403361344537815,
"grad_norm": 3.9698670375044944,
"learning_rate": 8.623083720264806e-06,
"loss": 0.1919,
"step": 288
},
{
"epoch": 0.4857142857142857,
"grad_norm": 6.189711620026336,
"learning_rate": 8.613974319136959e-06,
"loss": 0.2538,
"step": 289
},
{
"epoch": 0.48739495798319327,
"grad_norm": 4.139394655175779,
"learning_rate": 8.604839730186125e-06,
"loss": 0.1841,
"step": 290
},
{
"epoch": 0.4890756302521008,
"grad_norm": 4.255536063641137,
"learning_rate": 8.595680017076403e-06,
"loss": 0.2175,
"step": 291
},
{
"epoch": 0.4907563025210084,
"grad_norm": 3.4984785232169826,
"learning_rate": 8.586495243646992e-06,
"loss": 0.1834,
"step": 292
},
{
"epoch": 0.492436974789916,
"grad_norm": 3.9216701824988074,
"learning_rate": 8.577285473911753e-06,
"loss": 0.1737,
"step": 293
},
{
"epoch": 0.49411764705882355,
"grad_norm": 3.8780941059875027,
"learning_rate": 8.568050772058763e-06,
"loss": 0.186,
"step": 294
},
{
"epoch": 0.4957983193277311,
"grad_norm": 4.036911708332518,
"learning_rate": 8.558791202449857e-06,
"loss": 0.229,
"step": 295
},
{
"epoch": 0.49747899159663866,
"grad_norm": 3.514731462007391,
"learning_rate": 8.549506829620193e-06,
"loss": 0.1942,
"step": 296
},
{
"epoch": 0.4991596638655462,
"grad_norm": 3.2899615193670066,
"learning_rate": 8.540197718277797e-06,
"loss": 0.1781,
"step": 297
},
{
"epoch": 0.5008403361344538,
"grad_norm": 4.001548111037081,
"learning_rate": 8.530863933303108e-06,
"loss": 0.1553,
"step": 298
},
{
"epoch": 0.5025210084033613,
"grad_norm": 3.725806915044608,
"learning_rate": 8.521505539748535e-06,
"loss": 0.1946,
"step": 299
},
{
"epoch": 0.5042016806722689,
"grad_norm": 4.659889554742877,
"learning_rate": 8.512122602837993e-06,
"loss": 0.2254,
"step": 300
},
{
"epoch": 0.5058823529411764,
"grad_norm": 4.433725934195565,
"learning_rate": 8.502715187966455e-06,
"loss": 0.2236,
"step": 301
},
{
"epoch": 0.507563025210084,
"grad_norm": 3.699038449232604,
"learning_rate": 8.493283360699496e-06,
"loss": 0.1884,
"step": 302
},
{
"epoch": 0.5092436974789916,
"grad_norm": 4.3616517996529085,
"learning_rate": 8.483827186772832e-06,
"loss": 0.2594,
"step": 303
},
{
"epoch": 0.5109243697478991,
"grad_norm": 3.272531751768155,
"learning_rate": 8.47434673209187e-06,
"loss": 0.1864,
"step": 304
},
{
"epoch": 0.5126050420168067,
"grad_norm": 5.245759108142561,
"learning_rate": 8.464842062731235e-06,
"loss": 0.2004,
"step": 305
},
{
"epoch": 0.5142857142857142,
"grad_norm": 3.924276284348769,
"learning_rate": 8.455313244934324e-06,
"loss": 0.1824,
"step": 306
},
{
"epoch": 0.5159663865546219,
"grad_norm": 4.6395692867201195,
"learning_rate": 8.445760345112836e-06,
"loss": 0.2402,
"step": 307
},
{
"epoch": 0.5176470588235295,
"grad_norm": 3.5570683386402204,
"learning_rate": 8.436183429846314e-06,
"loss": 0.1809,
"step": 308
},
{
"epoch": 0.519327731092437,
"grad_norm": 4.102626184053465,
"learning_rate": 8.426582565881674e-06,
"loss": 0.2048,
"step": 309
},
{
"epoch": 0.5210084033613446,
"grad_norm": 3.52485735121433,
"learning_rate": 8.416957820132743e-06,
"loss": 0.163,
"step": 310
},
{
"epoch": 0.5226890756302521,
"grad_norm": 4.346874108094046,
"learning_rate": 8.407309259679801e-06,
"loss": 0.1943,
"step": 311
},
{
"epoch": 0.5243697478991597,
"grad_norm": 3.355148847177652,
"learning_rate": 8.397636951769099e-06,
"loss": 0.206,
"step": 312
},
{
"epoch": 0.5260504201680672,
"grad_norm": 3.0746648845052977,
"learning_rate": 8.387940963812398e-06,
"loss": 0.1799,
"step": 313
},
{
"epoch": 0.5277310924369748,
"grad_norm": 3.994292228485202,
"learning_rate": 8.378221363386506e-06,
"loss": 0.1758,
"step": 314
},
{
"epoch": 0.5294117647058824,
"grad_norm": 4.30723546706679,
"learning_rate": 8.368478218232787e-06,
"loss": 0.2426,
"step": 315
},
{
"epoch": 0.5310924369747899,
"grad_norm": 4.563931221264526,
"learning_rate": 8.358711596256712e-06,
"loss": 0.2205,
"step": 316
},
{
"epoch": 0.5327731092436975,
"grad_norm": 4.342112338014289,
"learning_rate": 8.348921565527373e-06,
"loss": 0.2655,
"step": 317
},
{
"epoch": 0.534453781512605,
"grad_norm": 4.148394609285132,
"learning_rate": 8.339108194277006e-06,
"loss": 0.1984,
"step": 318
},
{
"epoch": 0.5361344537815126,
"grad_norm": 5.008722667113153,
"learning_rate": 8.329271550900528e-06,
"loss": 0.2658,
"step": 319
},
{
"epoch": 0.5378151260504201,
"grad_norm": 3.841587828000924,
"learning_rate": 8.319411703955042e-06,
"loss": 0.2112,
"step": 320
},
{
"epoch": 0.5394957983193277,
"grad_norm": 4.50236826122661,
"learning_rate": 8.309528722159383e-06,
"loss": 0.2555,
"step": 321
},
{
"epoch": 0.5411764705882353,
"grad_norm": 4.710869349180975,
"learning_rate": 8.299622674393615e-06,
"loss": 0.2483,
"step": 322
},
{
"epoch": 0.5428571428571428,
"grad_norm": 3.557233194960392,
"learning_rate": 8.289693629698564e-06,
"loss": 0.2096,
"step": 323
},
{
"epoch": 0.5445378151260504,
"grad_norm": 3.766181124916138,
"learning_rate": 8.27974165727534e-06,
"loss": 0.2387,
"step": 324
},
{
"epoch": 0.5462184873949579,
"grad_norm": 5.327374692077757,
"learning_rate": 8.269766826484841e-06,
"loss": 0.2217,
"step": 325
},
{
"epoch": 0.5478991596638656,
"grad_norm": 6.34817507695616,
"learning_rate": 8.259769206847286e-06,
"loss": 0.2707,
"step": 326
},
{
"epoch": 0.5495798319327732,
"grad_norm": 4.381678895740831,
"learning_rate": 8.249748868041717e-06,
"loss": 0.2104,
"step": 327
},
{
"epoch": 0.5512605042016807,
"grad_norm": 3.998236552947582,
"learning_rate": 8.239705879905519e-06,
"loss": 0.2198,
"step": 328
},
{
"epoch": 0.5529411764705883,
"grad_norm": 4.213580958945381,
"learning_rate": 8.229640312433938e-06,
"loss": 0.1988,
"step": 329
},
{
"epoch": 0.5546218487394958,
"grad_norm": 3.759112597696243,
"learning_rate": 8.219552235779578e-06,
"loss": 0.1866,
"step": 330
},
{
"epoch": 0.5563025210084034,
"grad_norm": 3.3792729302736375,
"learning_rate": 8.209441720251934e-06,
"loss": 0.1907,
"step": 331
},
{
"epoch": 0.5579831932773109,
"grad_norm": 4.944461357096941,
"learning_rate": 8.199308836316883e-06,
"loss": 0.2319,
"step": 332
},
{
"epoch": 0.5596638655462185,
"grad_norm": 4.3235209989485925,
"learning_rate": 8.189153654596199e-06,
"loss": 0.1951,
"step": 333
},
{
"epoch": 0.561344537815126,
"grad_norm": 4.706476875198324,
"learning_rate": 8.178976245867068e-06,
"loss": 0.1995,
"step": 334
},
{
"epoch": 0.5630252100840336,
"grad_norm": 3.934741224212452,
"learning_rate": 8.168776681061583e-06,
"loss": 0.1704,
"step": 335
},
{
"epoch": 0.5647058823529412,
"grad_norm": 4.4892628919370425,
"learning_rate": 8.158555031266255e-06,
"loss": 0.2723,
"step": 336
},
{
"epoch": 0.5663865546218487,
"grad_norm": 4.718004820102291,
"learning_rate": 8.148311367721524e-06,
"loss": 0.221,
"step": 337
},
{
"epoch": 0.5680672268907563,
"grad_norm": 4.176984062196839,
"learning_rate": 8.138045761821252e-06,
"loss": 0.2361,
"step": 338
},
{
"epoch": 0.5697478991596638,
"grad_norm": 4.875232795035776,
"learning_rate": 8.127758285112226e-06,
"loss": 0.1787,
"step": 339
},
{
"epoch": 0.5714285714285714,
"grad_norm": 5.5937101005600365,
"learning_rate": 8.117449009293668e-06,
"loss": 0.2216,
"step": 340
},
{
"epoch": 0.573109243697479,
"grad_norm": 4.212760302913961,
"learning_rate": 8.107118006216732e-06,
"loss": 0.2994,
"step": 341
},
{
"epoch": 0.5747899159663865,
"grad_norm": 3.3746602463279185,
"learning_rate": 8.096765347883995e-06,
"loss": 0.1657,
"step": 342
},
{
"epoch": 0.5764705882352941,
"grad_norm": 3.9827276771268463,
"learning_rate": 8.086391106448965e-06,
"loss": 0.1944,
"step": 343
},
{
"epoch": 0.5781512605042017,
"grad_norm": 4.092011786287589,
"learning_rate": 8.075995354215578e-06,
"loss": 0.1775,
"step": 344
},
{
"epoch": 0.5798319327731093,
"grad_norm": 3.844261931463413,
"learning_rate": 8.065578163637686e-06,
"loss": 0.1849,
"step": 345
},
{
"epoch": 0.5815126050420169,
"grad_norm": 4.048687763988565,
"learning_rate": 8.055139607318558e-06,
"loss": 0.2156,
"step": 346
},
{
"epoch": 0.5831932773109244,
"grad_norm": 3.7618483841802335,
"learning_rate": 8.044679758010376e-06,
"loss": 0.2066,
"step": 347
},
{
"epoch": 0.584873949579832,
"grad_norm": 3.737106642987954,
"learning_rate": 8.03419868861372e-06,
"loss": 0.1807,
"step": 348
},
{
"epoch": 0.5865546218487395,
"grad_norm": 4.074003575121079,
"learning_rate": 8.023696472177068e-06,
"loss": 0.2227,
"step": 349
},
{
"epoch": 0.5882352941176471,
"grad_norm": 4.406312200787636,
"learning_rate": 8.013173181896283e-06,
"loss": 0.2373,
"step": 350
},
{
"epoch": 0.5899159663865546,
"grad_norm": 3.3827862112164917,
"learning_rate": 8.002628891114104e-06,
"loss": 0.1775,
"step": 351
},
{
"epoch": 0.5915966386554622,
"grad_norm": 5.429752189803184,
"learning_rate": 7.992063673319632e-06,
"loss": 0.2702,
"step": 352
},
{
"epoch": 0.5932773109243697,
"grad_norm": 4.120154578167831,
"learning_rate": 7.981477602147823e-06,
"loss": 0.2111,
"step": 353
},
{
"epoch": 0.5949579831932773,
"grad_norm": 2.9979780735807946,
"learning_rate": 7.97087075137897e-06,
"loss": 0.1397,
"step": 354
},
{
"epoch": 0.5966386554621849,
"grad_norm": 4.005290859935757,
"learning_rate": 7.960243194938192e-06,
"loss": 0.2193,
"step": 355
},
{
"epoch": 0.5983193277310924,
"grad_norm": 4.690992613506787,
"learning_rate": 7.949595006894917e-06,
"loss": 0.2113,
"step": 356
},
{
"epoch": 0.6,
"grad_norm": 3.6002782273862195,
"learning_rate": 7.938926261462366e-06,
"loss": 0.2194,
"step": 357
},
{
"epoch": 0.6016806722689075,
"grad_norm": 4.832797563091427,
"learning_rate": 7.928237032997037e-06,
"loss": 0.2212,
"step": 358
},
{
"epoch": 0.6033613445378151,
"grad_norm": 3.954408670704424,
"learning_rate": 7.917527395998183e-06,
"loss": 0.2093,
"step": 359
},
{
"epoch": 0.6050420168067226,
"grad_norm": 3.2290183482816897,
"learning_rate": 7.9067974251073e-06,
"loss": 0.1968,
"step": 360
},
{
"epoch": 0.6067226890756302,
"grad_norm": 4.651476660595037,
"learning_rate": 7.896047195107599e-06,
"loss": 0.2358,
"step": 361
},
{
"epoch": 0.6084033613445378,
"grad_norm": 4.357653576638546,
"learning_rate": 7.885276780923488e-06,
"loss": 0.2042,
"step": 362
},
{
"epoch": 0.6100840336134454,
"grad_norm": 5.0662328604426365,
"learning_rate": 7.87448625762005e-06,
"loss": 0.2529,
"step": 363
},
{
"epoch": 0.611764705882353,
"grad_norm": 4.096891845681164,
"learning_rate": 7.863675700402527e-06,
"loss": 0.2639,
"step": 364
},
{
"epoch": 0.6134453781512605,
"grad_norm": 4.404970446064358,
"learning_rate": 7.852845184615776e-06,
"loss": 0.2659,
"step": 365
},
{
"epoch": 0.6151260504201681,
"grad_norm": 4.205283221153625,
"learning_rate": 7.841994785743765e-06,
"loss": 0.2272,
"step": 366
},
{
"epoch": 0.6168067226890757,
"grad_norm": 5.230739104122307,
"learning_rate": 7.831124579409036e-06,
"loss": 0.2448,
"step": 367
},
{
"epoch": 0.6184873949579832,
"grad_norm": 3.6020763749502036,
"learning_rate": 7.820234641372182e-06,
"loss": 0.1714,
"step": 368
},
{
"epoch": 0.6201680672268908,
"grad_norm": 5.00315419944731,
"learning_rate": 7.809325047531315e-06,
"loss": 0.2037,
"step": 369
},
{
"epoch": 0.6218487394957983,
"grad_norm": 3.057913558877073,
"learning_rate": 7.798395873921542e-06,
"loss": 0.184,
"step": 370
},
{
"epoch": 0.6235294117647059,
"grad_norm": 4.100355290117442,
"learning_rate": 7.787447196714428e-06,
"loss": 0.2343,
"step": 371
},
{
"epoch": 0.6252100840336134,
"grad_norm": 4.158571878640022,
"learning_rate": 7.776479092217475e-06,
"loss": 0.249,
"step": 372
},
{
"epoch": 0.626890756302521,
"grad_norm": 4.066662933301616,
"learning_rate": 7.76549163687358e-06,
"loss": 0.1849,
"step": 373
},
{
"epoch": 0.6285714285714286,
"grad_norm": 4.089826297346638,
"learning_rate": 7.754484907260513e-06,
"loss": 0.1991,
"step": 374
},
{
"epoch": 0.6302521008403361,
"grad_norm": 4.234826231319271,
"learning_rate": 7.743458980090371e-06,
"loss": 0.2463,
"step": 375
},
{
"epoch": 0.6319327731092437,
"grad_norm": 3.265023140813625,
"learning_rate": 7.73241393220905e-06,
"loss": 0.1926,
"step": 376
},
{
"epoch": 0.6336134453781512,
"grad_norm": 3.134882453177233,
"learning_rate": 7.721349840595713e-06,
"loss": 0.1627,
"step": 377
},
{
"epoch": 0.6352941176470588,
"grad_norm": 4.423918239797939,
"learning_rate": 7.710266782362248e-06,
"loss": 0.2362,
"step": 378
},
{
"epoch": 0.6369747899159663,
"grad_norm": 3.9529856544671573,
"learning_rate": 7.69916483475273e-06,
"loss": 0.1822,
"step": 379
},
{
"epoch": 0.6386554621848739,
"grad_norm": 3.561131066780499,
"learning_rate": 7.688044075142888e-06,
"loss": 0.1633,
"step": 380
},
{
"epoch": 0.6403361344537815,
"grad_norm": 4.054438364074705,
"learning_rate": 7.676904581039559e-06,
"loss": 0.201,
"step": 381
},
{
"epoch": 0.6420168067226891,
"grad_norm": 3.19756418481164,
"learning_rate": 7.665746430080155e-06,
"loss": 0.1914,
"step": 382
},
{
"epoch": 0.6436974789915967,
"grad_norm": 5.087773350167841,
"learning_rate": 7.654569700032112e-06,
"loss": 0.2399,
"step": 383
},
{
"epoch": 0.6453781512605042,
"grad_norm": 4.8707614819292555,
"learning_rate": 7.643374468792364e-06,
"loss": 0.272,
"step": 384
},
{
"epoch": 0.6470588235294118,
"grad_norm": 3.7201873865620305,
"learning_rate": 7.63216081438678e-06,
"loss": 0.1768,
"step": 385
},
{
"epoch": 0.6487394957983194,
"grad_norm": 3.639086819183024,
"learning_rate": 7.620928814969636e-06,
"loss": 0.1856,
"step": 386
},
{
"epoch": 0.6504201680672269,
"grad_norm": 4.5114182364446185,
"learning_rate": 7.609678548823065e-06,
"loss": 0.2406,
"step": 387
},
{
"epoch": 0.6521008403361345,
"grad_norm": 4.37570960896787,
"learning_rate": 7.5984100943565055e-06,
"loss": 0.2287,
"step": 388
},
{
"epoch": 0.653781512605042,
"grad_norm": 4.71815661233923,
"learning_rate": 7.587123530106171e-06,
"loss": 0.2254,
"step": 389
},
{
"epoch": 0.6554621848739496,
"grad_norm": 4.948310397670234,
"learning_rate": 7.57581893473448e-06,
"loss": 0.2357,
"step": 390
},
{
"epoch": 0.6571428571428571,
"grad_norm": 4.0251665782039465,
"learning_rate": 7.564496387029532e-06,
"loss": 0.1872,
"step": 391
},
{
"epoch": 0.6588235294117647,
"grad_norm": 5.572454479666195,
"learning_rate": 7.553155965904535e-06,
"loss": 0.198,
"step": 392
},
{
"epoch": 0.6605042016806723,
"grad_norm": 6.1555918443054285,
"learning_rate": 7.541797750397277e-06,
"loss": 0.2744,
"step": 393
},
{
"epoch": 0.6621848739495798,
"grad_norm": 4.037230245840294,
"learning_rate": 7.530421819669558e-06,
"loss": 0.2114,
"step": 394
},
{
"epoch": 0.6638655462184874,
"grad_norm": 3.2696215968590883,
"learning_rate": 7.519028253006649e-06,
"loss": 0.2114,
"step": 395
},
{
"epoch": 0.6655462184873949,
"grad_norm": 4.073541716708416,
"learning_rate": 7.507617129816733e-06,
"loss": 0.2231,
"step": 396
},
{
"epoch": 0.6672268907563025,
"grad_norm": 3.409976478951058,
"learning_rate": 7.496188529630359e-06,
"loss": 0.1906,
"step": 397
},
{
"epoch": 0.66890756302521,
"grad_norm": 4.003033211601922,
"learning_rate": 7.484742532099878e-06,
"loss": 0.1951,
"step": 398
},
{
"epoch": 0.6705882352941176,
"grad_norm": 5.3967018780533245,
"learning_rate": 7.473279216998896e-06,
"loss": 0.2938,
"step": 399
},
{
"epoch": 0.6722689075630253,
"grad_norm": 4.650400594771881,
"learning_rate": 7.461798664221711e-06,
"loss": 0.2075,
"step": 400
},
{
"epoch": 0.6739495798319328,
"grad_norm": 4.445862374489302,
"learning_rate": 7.450300953782768e-06,
"loss": 0.1727,
"step": 401
},
{
"epoch": 0.6756302521008404,
"grad_norm": 3.6492298616881627,
"learning_rate": 7.438786165816084e-06,
"loss": 0.2474,
"step": 402
},
{
"epoch": 0.6773109243697479,
"grad_norm": 4.174927493743256,
"learning_rate": 7.427254380574705e-06,
"loss": 0.2298,
"step": 403
},
{
"epoch": 0.6789915966386555,
"grad_norm": 4.516892697613207,
"learning_rate": 7.415705678430138e-06,
"loss": 0.2445,
"step": 404
},
{
"epoch": 0.680672268907563,
"grad_norm": 5.1555804344893845,
"learning_rate": 7.404140139871797e-06,
"loss": 0.2457,
"step": 405
},
{
"epoch": 0.6823529411764706,
"grad_norm": 4.788105675096441,
"learning_rate": 7.392557845506433e-06,
"loss": 0.2454,
"step": 406
},
{
"epoch": 0.6840336134453782,
"grad_norm": 3.8556159103630128,
"learning_rate": 7.380958876057581e-06,
"loss": 0.1904,
"step": 407
},
{
"epoch": 0.6857142857142857,
"grad_norm": 3.923233518341112,
"learning_rate": 7.369343312364994e-06,
"loss": 0.191,
"step": 408
},
{
"epoch": 0.6873949579831933,
"grad_norm": 4.439644054359771,
"learning_rate": 7.357711235384079e-06,
"loss": 0.1808,
"step": 409
},
{
"epoch": 0.6890756302521008,
"grad_norm": 3.950994577812542,
"learning_rate": 7.346062726185332e-06,
"loss": 0.264,
"step": 410
},
{
"epoch": 0.6907563025210084,
"grad_norm": 4.63010670550712,
"learning_rate": 7.3343978659537775e-06,
"loss": 0.1907,
"step": 411
},
{
"epoch": 0.692436974789916,
"grad_norm": 3.2839801005028604,
"learning_rate": 7.3227167359883964e-06,
"loss": 0.1402,
"step": 412
},
{
"epoch": 0.6941176470588235,
"grad_norm": 3.983435177231738,
"learning_rate": 7.311019417701567e-06,
"loss": 0.2036,
"step": 413
},
{
"epoch": 0.6957983193277311,
"grad_norm": 4.171300817489276,
"learning_rate": 7.299305992618488e-06,
"loss": 0.2104,
"step": 414
},
{
"epoch": 0.6974789915966386,
"grad_norm": 3.291598476495542,
"learning_rate": 7.287576542376616e-06,
"loss": 0.1918,
"step": 415
},
{
"epoch": 0.6991596638655462,
"grad_norm": 3.0983336934959422,
"learning_rate": 7.275831148725101e-06,
"loss": 0.1717,
"step": 416
},
{
"epoch": 0.7008403361344537,
"grad_norm": 3.472013647791573,
"learning_rate": 7.264069893524207e-06,
"loss": 0.1839,
"step": 417
},
{
"epoch": 0.7025210084033613,
"grad_norm": 3.9532810808863905,
"learning_rate": 7.252292858744747e-06,
"loss": 0.1869,
"step": 418
},
{
"epoch": 0.704201680672269,
"grad_norm": 3.886560617548254,
"learning_rate": 7.24050012646751e-06,
"loss": 0.2145,
"step": 419
},
{
"epoch": 0.7058823529411765,
"grad_norm": 3.873829915587893,
"learning_rate": 7.2286917788826926e-06,
"loss": 0.1802,
"step": 420
},
{
"epoch": 0.7075630252100841,
"grad_norm": 3.8215403691812555,
"learning_rate": 7.216867898289319e-06,
"loss": 0.1784,
"step": 421
},
{
"epoch": 0.7092436974789916,
"grad_norm": 3.02923526458967,
"learning_rate": 7.2050285670946776e-06,
"loss": 0.1541,
"step": 422
},
{
"epoch": 0.7109243697478992,
"grad_norm": 3.894230763890687,
"learning_rate": 7.193173867813735e-06,
"loss": 0.1612,
"step": 423
},
{
"epoch": 0.7126050420168067,
"grad_norm": 4.346711078005211,
"learning_rate": 7.181303883068569e-06,
"loss": 0.2124,
"step": 424
},
{
"epoch": 0.7142857142857143,
"grad_norm": 4.470881599786187,
"learning_rate": 7.169418695587791e-06,
"loss": 0.2095,
"step": 425
},
{
"epoch": 0.7159663865546219,
"grad_norm": 4.802836466531396,
"learning_rate": 7.157518388205969e-06,
"loss": 0.1863,
"step": 426
},
{
"epoch": 0.7176470588235294,
"grad_norm": 5.540903785045774,
"learning_rate": 7.145603043863045e-06,
"loss": 0.3212,
"step": 427
},
{
"epoch": 0.719327731092437,
"grad_norm": 4.7139148814674305,
"learning_rate": 7.1336727456037716e-06,
"loss": 0.2206,
"step": 428
},
{
"epoch": 0.7210084033613445,
"grad_norm": 4.370838665402854,
"learning_rate": 7.121727576577116e-06,
"loss": 0.1965,
"step": 429
},
{
"epoch": 0.7226890756302521,
"grad_norm": 4.566376247147124,
"learning_rate": 7.109767620035689e-06,
"loss": 0.2085,
"step": 430
},
{
"epoch": 0.7243697478991596,
"grad_norm": 4.642592708671401,
"learning_rate": 7.097792959335169e-06,
"loss": 0.2326,
"step": 431
},
{
"epoch": 0.7260504201680672,
"grad_norm": 4.345117649290927,
"learning_rate": 7.0858036779337095e-06,
"loss": 0.2093,
"step": 432
},
{
"epoch": 0.7277310924369748,
"grad_norm": 3.374330056349454,
"learning_rate": 7.0737998593913686e-06,
"loss": 0.1793,
"step": 433
},
{
"epoch": 0.7294117647058823,
"grad_norm": 3.087076270827566,
"learning_rate": 7.061781587369518e-06,
"loss": 0.1545,
"step": 434
},
{
"epoch": 0.7310924369747899,
"grad_norm": 4.534902982370824,
"learning_rate": 7.049748945630269e-06,
"loss": 0.2009,
"step": 435
},
{
"epoch": 0.7327731092436974,
"grad_norm": 4.596100062760324,
"learning_rate": 7.037702018035878e-06,
"loss": 0.2578,
"step": 436
},
{
"epoch": 0.7344537815126051,
"grad_norm": 4.584632394943085,
"learning_rate": 7.0256408885481694e-06,
"loss": 0.1796,
"step": 437
},
{
"epoch": 0.7361344537815127,
"grad_norm": 5.400392053544899,
"learning_rate": 7.013565641227954e-06,
"loss": 0.2702,
"step": 438
},
{
"epoch": 0.7378151260504202,
"grad_norm": 4.092852923049045,
"learning_rate": 7.001476360234428e-06,
"loss": 0.2004,
"step": 439
},
{
"epoch": 0.7394957983193278,
"grad_norm": 3.5689773393131192,
"learning_rate": 6.989373129824605e-06,
"loss": 0.1711,
"step": 440
},
{
"epoch": 0.7411764705882353,
"grad_norm": 4.434405027954638,
"learning_rate": 6.977256034352713e-06,
"loss": 0.1862,
"step": 441
},
{
"epoch": 0.7428571428571429,
"grad_norm": 3.3402058102648784,
"learning_rate": 6.965125158269619e-06,
"loss": 0.1704,
"step": 442
},
{
"epoch": 0.7445378151260504,
"grad_norm": 4.450842620271656,
"learning_rate": 6.952980586122231e-06,
"loss": 0.179,
"step": 443
},
{
"epoch": 0.746218487394958,
"grad_norm": 4.5683781079735715,
"learning_rate": 6.940822402552914e-06,
"loss": 0.2515,
"step": 444
},
{
"epoch": 0.7478991596638656,
"grad_norm": 3.936528425785662,
"learning_rate": 6.928650692298898e-06,
"loss": 0.1535,
"step": 445
},
{
"epoch": 0.7495798319327731,
"grad_norm": 3.585367350636562,
"learning_rate": 6.916465540191692e-06,
"loss": 0.1368,
"step": 446
},
{
"epoch": 0.7512605042016807,
"grad_norm": 4.195744282723126,
"learning_rate": 6.904267031156482e-06,
"loss": 0.2003,
"step": 447
},
{
"epoch": 0.7529411764705882,
"grad_norm": 5.910759174258135,
"learning_rate": 6.892055250211552e-06,
"loss": 0.2944,
"step": 448
},
{
"epoch": 0.7546218487394958,
"grad_norm": 2.926901791069088,
"learning_rate": 6.879830282467682e-06,
"loss": 0.1387,
"step": 449
},
{
"epoch": 0.7563025210084033,
"grad_norm": 3.6549630804255946,
"learning_rate": 6.867592213127559e-06,
"loss": 0.1798,
"step": 450
},
{
"epoch": 0.7579831932773109,
"grad_norm": 3.8334460927490093,
"learning_rate": 6.855341127485183e-06,
"loss": 0.1726,
"step": 451
},
{
"epoch": 0.7596638655462185,
"grad_norm": 4.456207894333341,
"learning_rate": 6.84307711092527e-06,
"loss": 0.2311,
"step": 452
},
{
"epoch": 0.761344537815126,
"grad_norm": 4.414789487098786,
"learning_rate": 6.8308002489226645e-06,
"loss": 0.2103,
"step": 453
},
{
"epoch": 0.7630252100840336,
"grad_norm": 3.893046041679212,
"learning_rate": 6.81851062704173e-06,
"loss": 0.1943,
"step": 454
},
{
"epoch": 0.7647058823529411,
"grad_norm": 3.43196206922528,
"learning_rate": 6.806208330935766e-06,
"loss": 0.1767,
"step": 455
},
{
"epoch": 0.7663865546218488,
"grad_norm": 4.520503474294174,
"learning_rate": 6.793893446346405e-06,
"loss": 0.2123,
"step": 456
},
{
"epoch": 0.7680672268907563,
"grad_norm": 3.964065429569578,
"learning_rate": 6.7815660591030155e-06,
"loss": 0.2025,
"step": 457
},
{
"epoch": 0.7697478991596639,
"grad_norm": 4.19984280134145,
"learning_rate": 6.769226255122104e-06,
"loss": 0.1915,
"step": 458
},
{
"epoch": 0.7714285714285715,
"grad_norm": 4.839865608547356,
"learning_rate": 6.7568741204067145e-06,
"loss": 0.2495,
"step": 459
},
{
"epoch": 0.773109243697479,
"grad_norm": 3.7709931628629194,
"learning_rate": 6.744509741045835e-06,
"loss": 0.2231,
"step": 460
},
{
"epoch": 0.7747899159663866,
"grad_norm": 4.272844439888244,
"learning_rate": 6.7321332032137875e-06,
"loss": 0.1612,
"step": 461
},
{
"epoch": 0.7764705882352941,
"grad_norm": 3.9915676640206263,
"learning_rate": 6.719744593169642e-06,
"loss": 0.2039,
"step": 462
},
{
"epoch": 0.7781512605042017,
"grad_norm": 3.8781159196470063,
"learning_rate": 6.7073439972565955e-06,
"loss": 0.2488,
"step": 463
},
{
"epoch": 0.7798319327731092,
"grad_norm": 3.677549031841177,
"learning_rate": 6.6949315019013895e-06,
"loss": 0.1634,
"step": 464
},
{
"epoch": 0.7815126050420168,
"grad_norm": 3.27402087914958,
"learning_rate": 6.682507193613697e-06,
"loss": 0.1907,
"step": 465
},
{
"epoch": 0.7831932773109244,
"grad_norm": 4.551565457153353,
"learning_rate": 6.670071158985521e-06,
"loss": 0.2494,
"step": 466
},
{
"epoch": 0.7848739495798319,
"grad_norm": 4.295403358109363,
"learning_rate": 6.657623484690593e-06,
"loss": 0.2359,
"step": 467
},
{
"epoch": 0.7865546218487395,
"grad_norm": 4.889317330640271,
"learning_rate": 6.645164257483767e-06,
"loss": 0.2795,
"step": 468
},
{
"epoch": 0.788235294117647,
"grad_norm": 3.5002311648455975,
"learning_rate": 6.6326935642004165e-06,
"loss": 0.2332,
"step": 469
},
{
"epoch": 0.7899159663865546,
"grad_norm": 3.94464205537663,
"learning_rate": 6.62021149175583e-06,
"loss": 0.2435,
"step": 470
},
{
"epoch": 0.7915966386554621,
"grad_norm": 3.8028700500611565,
"learning_rate": 6.607718127144601e-06,
"loss": 0.1995,
"step": 471
},
{
"epoch": 0.7932773109243697,
"grad_norm": 4.525546500618434,
"learning_rate": 6.595213557440026e-06,
"loss": 0.2288,
"step": 472
},
{
"epoch": 0.7949579831932773,
"grad_norm": 5.606880801822294,
"learning_rate": 6.582697869793495e-06,
"loss": 0.2867,
"step": 473
},
{
"epoch": 0.7966386554621848,
"grad_norm": 3.6542402819017394,
"learning_rate": 6.570171151433887e-06,
"loss": 0.1618,
"step": 474
},
{
"epoch": 0.7983193277310925,
"grad_norm": 4.210723963029238,
"learning_rate": 6.557633489666958e-06,
"loss": 0.2028,
"step": 475
},
{
"epoch": 0.8,
"grad_norm": 3.2867443961571956,
"learning_rate": 6.545084971874738e-06,
"loss": 0.1748,
"step": 476
},
{
"epoch": 0.8016806722689076,
"grad_norm": 2.9348406753396694,
"learning_rate": 6.532525685514915e-06,
"loss": 0.1964,
"step": 477
},
{
"epoch": 0.8033613445378152,
"grad_norm": 4.528700088502344,
"learning_rate": 6.519955718120231e-06,
"loss": 0.2665,
"step": 478
},
{
"epoch": 0.8050420168067227,
"grad_norm": 3.942446224867525,
"learning_rate": 6.507375157297872e-06,
"loss": 0.1981,
"step": 479
},
{
"epoch": 0.8067226890756303,
"grad_norm": 3.0398230461352687,
"learning_rate": 6.494784090728852e-06,
"loss": 0.1563,
"step": 480
},
{
"epoch": 0.8084033613445378,
"grad_norm": 4.140405081211564,
"learning_rate": 6.482182606167409e-06,
"loss": 0.259,
"step": 481
},
{
"epoch": 0.8100840336134454,
"grad_norm": 3.560603255149939,
"learning_rate": 6.469570791440385e-06,
"loss": 0.1507,
"step": 482
},
{
"epoch": 0.8117647058823529,
"grad_norm": 3.864812587459039,
"learning_rate": 6.456948734446624e-06,
"loss": 0.2533,
"step": 483
},
{
"epoch": 0.8134453781512605,
"grad_norm": 3.978398759949487,
"learning_rate": 6.444316523156352e-06,
"loss": 0.1719,
"step": 484
},
{
"epoch": 0.8151260504201681,
"grad_norm": 3.906418335788084,
"learning_rate": 6.4316742456105645e-06,
"loss": 0.2105,
"step": 485
},
{
"epoch": 0.8168067226890756,
"grad_norm": 4.288716787663696,
"learning_rate": 6.419021989920416e-06,
"loss": 0.257,
"step": 486
},
{
"epoch": 0.8184873949579832,
"grad_norm": 3.613809860500844,
"learning_rate": 6.406359844266607e-06,
"loss": 0.2117,
"step": 487
},
{
"epoch": 0.8201680672268907,
"grad_norm": 4.094846884836178,
"learning_rate": 6.393687896898759e-06,
"loss": 0.2074,
"step": 488
},
{
"epoch": 0.8218487394957983,
"grad_norm": 3.7968850151738796,
"learning_rate": 6.381006236134817e-06,
"loss": 0.1863,
"step": 489
},
{
"epoch": 0.8235294117647058,
"grad_norm": 4.003377153677425,
"learning_rate": 6.368314950360416e-06,
"loss": 0.2318,
"step": 490
},
{
"epoch": 0.8252100840336134,
"grad_norm": 4.180943524371029,
"learning_rate": 6.355614128028277e-06,
"loss": 0.1799,
"step": 491
},
{
"epoch": 0.826890756302521,
"grad_norm": 3.506573280410032,
"learning_rate": 6.342903857657585e-06,
"loss": 0.1787,
"step": 492
},
{
"epoch": 0.8285714285714286,
"grad_norm": 4.146295337136301,
"learning_rate": 6.330184227833376e-06,
"loss": 0.2522,
"step": 493
},
{
"epoch": 0.8302521008403362,
"grad_norm": 3.604462904458598,
"learning_rate": 6.317455327205916e-06,
"loss": 0.1934,
"step": 494
},
{
"epoch": 0.8319327731092437,
"grad_norm": 3.368315681661413,
"learning_rate": 6.304717244490084e-06,
"loss": 0.1691,
"step": 495
},
{
"epoch": 0.8336134453781513,
"grad_norm": 4.391274581535866,
"learning_rate": 6.291970068464755e-06,
"loss": 0.1983,
"step": 496
},
{
"epoch": 0.8352941176470589,
"grad_norm": 3.9021942728409207,
"learning_rate": 6.279213887972179e-06,
"loss": 0.2088,
"step": 497
},
{
"epoch": 0.8369747899159664,
"grad_norm": 4.660288821078394,
"learning_rate": 6.266448791917364e-06,
"loss": 0.2459,
"step": 498
},
{
"epoch": 0.838655462184874,
"grad_norm": 3.6787269513356637,
"learning_rate": 6.253674869267457e-06,
"loss": 0.1722,
"step": 499
},
{
"epoch": 0.8403361344537815,
"grad_norm": 2.8892867743590096,
"learning_rate": 6.24089220905112e-06,
"loss": 0.1319,
"step": 500
},
{
"epoch": 0.8403361344537815,
"eval_loss": 0.19269125163555145,
"eval_runtime": 1.2338,
"eval_samples_per_second": 39.714,
"eval_steps_per_second": 10.536,
"step": 500
},
{
"epoch": 0.8420168067226891,
"grad_norm": 4.305155123990724,
"learning_rate": 6.228100900357914e-06,
"loss": 0.2207,
"step": 501
},
{
"epoch": 0.8436974789915966,
"grad_norm": 3.1815187896266863,
"learning_rate": 6.215301032337674e-06,
"loss": 0.1676,
"step": 502
},
{
"epoch": 0.8453781512605042,
"grad_norm": 3.857922746443263,
"learning_rate": 6.202492694199893e-06,
"loss": 0.1769,
"step": 503
},
{
"epoch": 0.8470588235294118,
"grad_norm": 4.386476438810581,
"learning_rate": 6.189675975213094e-06,
"loss": 0.2229,
"step": 504
},
{
"epoch": 0.8487394957983193,
"grad_norm": 4.114744331162852,
"learning_rate": 6.176850964704213e-06,
"loss": 0.1997,
"step": 505
},
{
"epoch": 0.8504201680672269,
"grad_norm": 3.6941593344593913,
"learning_rate": 6.164017752057972e-06,
"loss": 0.2237,
"step": 506
},
{
"epoch": 0.8521008403361344,
"grad_norm": 3.0148646159345804,
"learning_rate": 6.151176426716261e-06,
"loss": 0.1536,
"step": 507
},
{
"epoch": 0.853781512605042,
"grad_norm": 3.439147590128657,
"learning_rate": 6.13832707817751e-06,
"loss": 0.1735,
"step": 508
},
{
"epoch": 0.8554621848739495,
"grad_norm": 3.6968600719242692,
"learning_rate": 6.125469795996065e-06,
"loss": 0.1789,
"step": 509
},
{
"epoch": 0.8571428571428571,
"grad_norm": 3.355068222017964,
"learning_rate": 6.112604669781572e-06,
"loss": 0.2063,
"step": 510
},
{
"epoch": 0.8588235294117647,
"grad_norm": 3.3680814260117735,
"learning_rate": 6.099731789198344e-06,
"loss": 0.1371,
"step": 511
},
{
"epoch": 0.8605042016806723,
"grad_norm": 3.487954359774497,
"learning_rate": 6.0868512439647345e-06,
"loss": 0.1718,
"step": 512
},
{
"epoch": 0.8621848739495799,
"grad_norm": 3.890256891189335,
"learning_rate": 6.073963123852522e-06,
"loss": 0.1765,
"step": 513
},
{
"epoch": 0.8638655462184874,
"grad_norm": 3.7318020528328524,
"learning_rate": 6.061067518686277e-06,
"loss": 0.1817,
"step": 514
},
{
"epoch": 0.865546218487395,
"grad_norm": 3.7656918774904544,
"learning_rate": 6.048164518342734e-06,
"loss": 0.2006,
"step": 515
},
{
"epoch": 0.8672268907563025,
"grad_norm": 3.5973168312687487,
"learning_rate": 6.035254212750172e-06,
"loss": 0.2338,
"step": 516
},
{
"epoch": 0.8689075630252101,
"grad_norm": 4.876839752819946,
"learning_rate": 6.022336691887785e-06,
"loss": 0.2091,
"step": 517
},
{
"epoch": 0.8705882352941177,
"grad_norm": 3.868757342188446,
"learning_rate": 6.009412045785051e-06,
"loss": 0.2211,
"step": 518
},
{
"epoch": 0.8722689075630252,
"grad_norm": 3.788258159682757,
"learning_rate": 5.996480364521114e-06,
"loss": 0.2236,
"step": 519
},
{
"epoch": 0.8739495798319328,
"grad_norm": 3.9423457060806606,
"learning_rate": 5.983541738224141e-06,
"loss": 0.2087,
"step": 520
},
{
"epoch": 0.8756302521008403,
"grad_norm": 4.0221101831521215,
"learning_rate": 5.970596257070711e-06,
"loss": 0.2095,
"step": 521
},
{
"epoch": 0.8773109243697479,
"grad_norm": 3.5399279637868553,
"learning_rate": 5.957644011285173e-06,
"loss": 0.1803,
"step": 522
},
{
"epoch": 0.8789915966386554,
"grad_norm": 3.960179989285108,
"learning_rate": 5.944685091139026e-06,
"loss": 0.2075,
"step": 523
},
{
"epoch": 0.880672268907563,
"grad_norm": 3.791068687537994,
"learning_rate": 5.931719586950286e-06,
"loss": 0.2153,
"step": 524
},
{
"epoch": 0.8823529411764706,
"grad_norm": 3.6833034726015317,
"learning_rate": 5.918747589082853e-06,
"loss": 0.1893,
"step": 525
},
{
"epoch": 0.8840336134453781,
"grad_norm": 3.34569881611167,
"learning_rate": 5.905769187945889e-06,
"loss": 0.1874,
"step": 526
},
{
"epoch": 0.8857142857142857,
"grad_norm": 3.5706859104839443,
"learning_rate": 5.892784473993184e-06,
"loss": 0.1515,
"step": 527
},
{
"epoch": 0.8873949579831932,
"grad_norm": 3.2543249925897175,
"learning_rate": 5.879793537722525e-06,
"loss": 0.1589,
"step": 528
},
{
"epoch": 0.8890756302521008,
"grad_norm": 3.835128233080215,
"learning_rate": 5.8667964696750625e-06,
"loss": 0.1623,
"step": 529
},
{
"epoch": 0.8907563025210085,
"grad_norm": 4.145256581839058,
"learning_rate": 5.853793360434687e-06,
"loss": 0.2354,
"step": 530
},
{
"epoch": 0.892436974789916,
"grad_norm": 3.5184174760119684,
"learning_rate": 5.840784300627396e-06,
"loss": 0.2513,
"step": 531
},
{
"epoch": 0.8941176470588236,
"grad_norm": 3.85133303541918,
"learning_rate": 5.82776938092065e-06,
"loss": 0.179,
"step": 532
},
{
"epoch": 0.8957983193277311,
"grad_norm": 4.151107547880621,
"learning_rate": 5.814748692022761e-06,
"loss": 0.1681,
"step": 533
},
{
"epoch": 0.8974789915966387,
"grad_norm": 4.844244644198082,
"learning_rate": 5.801722324682243e-06,
"loss": 0.2482,
"step": 534
},
{
"epoch": 0.8991596638655462,
"grad_norm": 3.128553167302584,
"learning_rate": 5.788690369687188e-06,
"loss": 0.1688,
"step": 535
},
{
"epoch": 0.9008403361344538,
"grad_norm": 4.787289254729522,
"learning_rate": 5.775652917864633e-06,
"loss": 0.2269,
"step": 536
},
{
"epoch": 0.9025210084033614,
"grad_norm": 4.200201714416603,
"learning_rate": 5.762610060079926e-06,
"loss": 0.2303,
"step": 537
},
{
"epoch": 0.9042016806722689,
"grad_norm": 4.12447129777908,
"learning_rate": 5.749561887236088e-06,
"loss": 0.223,
"step": 538
},
{
"epoch": 0.9058823529411765,
"grad_norm": 4.09214914019045,
"learning_rate": 5.736508490273189e-06,
"loss": 0.1983,
"step": 539
},
{
"epoch": 0.907563025210084,
"grad_norm": 3.8622295106717903,
"learning_rate": 5.723449960167703e-06,
"loss": 0.1658,
"step": 540
},
{
"epoch": 0.9092436974789916,
"grad_norm": 3.880296503246165,
"learning_rate": 5.710386387931886e-06,
"loss": 0.2093,
"step": 541
},
{
"epoch": 0.9109243697478991,
"grad_norm": 3.8952684640098663,
"learning_rate": 5.697317864613127e-06,
"loss": 0.2113,
"step": 542
},
{
"epoch": 0.9126050420168067,
"grad_norm": 5.702686238624931,
"learning_rate": 5.684244481293335e-06,
"loss": 0.3463,
"step": 543
},
{
"epoch": 0.9142857142857143,
"grad_norm": 4.148502838419665,
"learning_rate": 5.671166329088278e-06,
"loss": 0.1912,
"step": 544
},
{
"epoch": 0.9159663865546218,
"grad_norm": 4.268707661002628,
"learning_rate": 5.658083499146968e-06,
"loss": 0.2138,
"step": 545
},
{
"epoch": 0.9176470588235294,
"grad_norm": 4.455105326523613,
"learning_rate": 5.644996082651018e-06,
"loss": 0.2346,
"step": 546
},
{
"epoch": 0.9193277310924369,
"grad_norm": 3.4501943037207456,
"learning_rate": 5.6319041708140045e-06,
"loss": 0.1399,
"step": 547
},
{
"epoch": 0.9210084033613445,
"grad_norm": 3.2939764503789197,
"learning_rate": 5.6188078548808366e-06,
"loss": 0.1744,
"step": 548
},
{
"epoch": 0.9226890756302522,
"grad_norm": 3.980663885603751,
"learning_rate": 5.6057072261271194e-06,
"loss": 0.1973,
"step": 549
},
{
"epoch": 0.9243697478991597,
"grad_norm": 4.204596605574354,
"learning_rate": 5.592602375858515e-06,
"loss": 0.2391,
"step": 550
},
{
"epoch": 0.9260504201680673,
"grad_norm": 3.8468761629959447,
"learning_rate": 5.579493395410105e-06,
"loss": 0.2292,
"step": 551
},
{
"epoch": 0.9277310924369748,
"grad_norm": 4.078220920961927,
"learning_rate": 5.566380376145762e-06,
"loss": 0.2246,
"step": 552
},
{
"epoch": 0.9294117647058824,
"grad_norm": 3.4471781455989836,
"learning_rate": 5.553263409457504e-06,
"loss": 0.1621,
"step": 553
},
{
"epoch": 0.9310924369747899,
"grad_norm": 3.8735614371403795,
"learning_rate": 5.540142586764862e-06,
"loss": 0.2191,
"step": 554
},
{
"epoch": 0.9327731092436975,
"grad_norm": 4.252312135545805,
"learning_rate": 5.527017999514239e-06,
"loss": 0.2322,
"step": 555
},
{
"epoch": 0.934453781512605,
"grad_norm": 3.82412550009896,
"learning_rate": 5.51388973917828e-06,
"loss": 0.1896,
"step": 556
},
{
"epoch": 0.9361344537815126,
"grad_norm": 4.155221147014321,
"learning_rate": 5.5007578972552246e-06,
"loss": 0.2503,
"step": 557
},
{
"epoch": 0.9378151260504202,
"grad_norm": 5.351585100726684,
"learning_rate": 5.4876225652682776e-06,
"loss": 0.2787,
"step": 558
},
{
"epoch": 0.9394957983193277,
"grad_norm": 3.1779913442831313,
"learning_rate": 5.474483834764968e-06,
"loss": 0.1735,
"step": 559
},
{
"epoch": 0.9411764705882353,
"grad_norm": 3.991015985678567,
"learning_rate": 5.46134179731651e-06,
"loss": 0.256,
"step": 560
},
{
"epoch": 0.9428571428571428,
"grad_norm": 4.20790696331099,
"learning_rate": 5.448196544517168e-06,
"loss": 0.2059,
"step": 561
},
{
"epoch": 0.9445378151260504,
"grad_norm": 3.4948615727030776,
"learning_rate": 5.435048167983613e-06,
"loss": 0.1985,
"step": 562
},
{
"epoch": 0.946218487394958,
"grad_norm": 3.473819533320583,
"learning_rate": 5.421896759354288e-06,
"loss": 0.156,
"step": 563
},
{
"epoch": 0.9478991596638655,
"grad_norm": 3.5100781536487915,
"learning_rate": 5.408742410288769e-06,
"loss": 0.1861,
"step": 564
},
{
"epoch": 0.9495798319327731,
"grad_norm": 4.3829409660703265,
"learning_rate": 5.395585212467124e-06,
"loss": 0.1835,
"step": 565
},
{
"epoch": 0.9512605042016806,
"grad_norm": 4.99516346331094,
"learning_rate": 5.382425257589277e-06,
"loss": 0.2931,
"step": 566
},
{
"epoch": 0.9529411764705882,
"grad_norm": 3.8273375889082772,
"learning_rate": 5.36926263737437e-06,
"loss": 0.1865,
"step": 567
},
{
"epoch": 0.9546218487394958,
"grad_norm": 3.4442977342234893,
"learning_rate": 5.356097443560116e-06,
"loss": 0.1864,
"step": 568
},
{
"epoch": 0.9563025210084034,
"grad_norm": 3.466342287642546,
"learning_rate": 5.342929767902168e-06,
"loss": 0.1711,
"step": 569
},
{
"epoch": 0.957983193277311,
"grad_norm": 4.347576000442588,
"learning_rate": 5.329759702173477e-06,
"loss": 0.2321,
"step": 570
},
{
"epoch": 0.9596638655462185,
"grad_norm": 3.3229433415787573,
"learning_rate": 5.316587338163649e-06,
"loss": 0.166,
"step": 571
},
{
"epoch": 0.9613445378151261,
"grad_norm": 4.009826790458202,
"learning_rate": 5.30341276767831e-06,
"loss": 0.1621,
"step": 572
},
{
"epoch": 0.9630252100840336,
"grad_norm": 4.076780910110831,
"learning_rate": 5.290236082538464e-06,
"loss": 0.2166,
"step": 573
},
{
"epoch": 0.9647058823529412,
"grad_norm": 3.7350445830670655,
"learning_rate": 5.27705737457985e-06,
"loss": 0.2014,
"step": 574
},
{
"epoch": 0.9663865546218487,
"grad_norm": 4.830893698589951,
"learning_rate": 5.2638767356523125e-06,
"loss": 0.2067,
"step": 575
},
{
"epoch": 0.9680672268907563,
"grad_norm": 3.400150909293186,
"learning_rate": 5.2506942576191466e-06,
"loss": 0.1654,
"step": 576
},
{
"epoch": 0.9697478991596639,
"grad_norm": 3.7474326387520596,
"learning_rate": 5.23751003235647e-06,
"loss": 0.1942,
"step": 577
},
{
"epoch": 0.9714285714285714,
"grad_norm": 3.7927268527910245,
"learning_rate": 5.224324151752575e-06,
"loss": 0.1804,
"step": 578
},
{
"epoch": 0.973109243697479,
"grad_norm": 3.4344555746240424,
"learning_rate": 5.211136707707293e-06,
"loss": 0.1766,
"step": 579
},
{
"epoch": 0.9747899159663865,
"grad_norm": 3.641318428686794,
"learning_rate": 5.197947792131348e-06,
"loss": 0.1824,
"step": 580
},
{
"epoch": 0.9764705882352941,
"grad_norm": 3.928761105749297,
"learning_rate": 5.184757496945726e-06,
"loss": 0.1529,
"step": 581
},
{
"epoch": 0.9781512605042016,
"grad_norm": 4.742597309312045,
"learning_rate": 5.1715659140810225e-06,
"loss": 0.2013,
"step": 582
},
{
"epoch": 0.9798319327731092,
"grad_norm": 4.980888920820449,
"learning_rate": 5.158373135476811e-06,
"loss": 0.2352,
"step": 583
},
{
"epoch": 0.9815126050420168,
"grad_norm": 3.485110939059201,
"learning_rate": 5.145179253080997e-06,
"loss": 0.1568,
"step": 584
},
{
"epoch": 0.9831932773109243,
"grad_norm": 3.964452907941118,
"learning_rate": 5.131984358849182e-06,
"loss": 0.1985,
"step": 585
},
{
"epoch": 0.984873949579832,
"grad_norm": 3.46258407863298,
"learning_rate": 5.118788544744016e-06,
"loss": 0.1444,
"step": 586
},
{
"epoch": 0.9865546218487395,
"grad_norm": 3.7112975214320083,
"learning_rate": 5.105591902734561e-06,
"loss": 0.1687,
"step": 587
},
{
"epoch": 0.9882352941176471,
"grad_norm": 4.641220841662963,
"learning_rate": 5.09239452479565e-06,
"loss": 0.189,
"step": 588
},
{
"epoch": 0.9899159663865547,
"grad_norm": 3.775450292074785,
"learning_rate": 5.079196502907246e-06,
"loss": 0.1787,
"step": 589
},
{
"epoch": 0.9915966386554622,
"grad_norm": 3.1329955863518633,
"learning_rate": 5.065997929053795e-06,
"loss": 0.1587,
"step": 590
},
{
"epoch": 0.9932773109243698,
"grad_norm": 4.076665571532509,
"learning_rate": 5.052798895223597e-06,
"loss": 0.1631,
"step": 591
},
{
"epoch": 0.9949579831932773,
"grad_norm": 4.242436811769809,
"learning_rate": 5.039599493408154e-06,
"loss": 0.2022,
"step": 592
},
{
"epoch": 0.9966386554621849,
"grad_norm": 3.6846927374699603,
"learning_rate": 5.026399815601533e-06,
"loss": 0.1845,
"step": 593
},
{
"epoch": 0.9983193277310924,
"grad_norm": 4.151082605904576,
"learning_rate": 5.0131999537997235e-06,
"loss": 0.193,
"step": 594
},
{
"epoch": 1.0,
"grad_norm": 4.076240791018785,
"learning_rate": 5e-06,
"loss": 0.1675,
"step": 595
},
{
"epoch": 1.0016806722689076,
"grad_norm": 2.8260558868123344,
"learning_rate": 4.986800046200278e-06,
"loss": 0.1033,
"step": 596
},
{
"epoch": 1.0033613445378151,
"grad_norm": 2.4638758993873733,
"learning_rate": 4.97360018439847e-06,
"loss": 0.0739,
"step": 597
},
{
"epoch": 1.0050420168067227,
"grad_norm": 2.731312250527938,
"learning_rate": 4.960400506591848e-06,
"loss": 0.078,
"step": 598
},
{
"epoch": 1.0067226890756302,
"grad_norm": 2.851110493741904,
"learning_rate": 4.947201104776404e-06,
"loss": 0.0819,
"step": 599
},
{
"epoch": 1.0084033613445378,
"grad_norm": 3.6514539147116216,
"learning_rate": 4.934002070946206e-06,
"loss": 0.0869,
"step": 600
},
{
"epoch": 1.0100840336134453,
"grad_norm": 3.007592142558298,
"learning_rate": 4.920803497092757e-06,
"loss": 0.0862,
"step": 601
},
{
"epoch": 1.011764705882353,
"grad_norm": 2.4923023998651224,
"learning_rate": 4.907605475204352e-06,
"loss": 0.0818,
"step": 602
},
{
"epoch": 1.0134453781512605,
"grad_norm": 3.9074853870198187,
"learning_rate": 4.894408097265441e-06,
"loss": 0.0982,
"step": 603
},
{
"epoch": 1.015126050420168,
"grad_norm": 2.738820573589542,
"learning_rate": 4.881211455255986e-06,
"loss": 0.061,
"step": 604
},
{
"epoch": 1.0168067226890756,
"grad_norm": 2.923472628759956,
"learning_rate": 4.86801564115082e-06,
"loss": 0.0947,
"step": 605
},
{
"epoch": 1.0184873949579831,
"grad_norm": 2.8463378417161027,
"learning_rate": 4.854820746919005e-06,
"loss": 0.0992,
"step": 606
},
{
"epoch": 1.0201680672268907,
"grad_norm": 3.3678742941776996,
"learning_rate": 4.8416268645231915e-06,
"loss": 0.0906,
"step": 607
},
{
"epoch": 1.0218487394957982,
"grad_norm": 2.294318309383224,
"learning_rate": 4.82843408591898e-06,
"loss": 0.0765,
"step": 608
},
{
"epoch": 1.0235294117647058,
"grad_norm": 2.5732816755240138,
"learning_rate": 4.815242503054277e-06,
"loss": 0.053,
"step": 609
},
{
"epoch": 1.0252100840336134,
"grad_norm": 3.2633128617028273,
"learning_rate": 4.802052207868654e-06,
"loss": 0.0768,
"step": 610
},
{
"epoch": 1.026890756302521,
"grad_norm": 3.494394392617399,
"learning_rate": 4.78886329229271e-06,
"loss": 0.0817,
"step": 611
},
{
"epoch": 1.0285714285714285,
"grad_norm": 2.7203476878745727,
"learning_rate": 4.775675848247427e-06,
"loss": 0.0735,
"step": 612
},
{
"epoch": 1.030252100840336,
"grad_norm": 4.00604536341265,
"learning_rate": 4.762489967643532e-06,
"loss": 0.093,
"step": 613
},
{
"epoch": 1.0319327731092436,
"grad_norm": 5.6713321751245775,
"learning_rate": 4.749305742380853e-06,
"loss": 0.1251,
"step": 614
},
{
"epoch": 1.0336134453781514,
"grad_norm": 3.69960957020687,
"learning_rate": 4.736123264347688e-06,
"loss": 0.0577,
"step": 615
},
{
"epoch": 1.035294117647059,
"grad_norm": 4.033751938519058,
"learning_rate": 4.7229426254201504e-06,
"loss": 0.0932,
"step": 616
},
{
"epoch": 1.0369747899159665,
"grad_norm": 4.34024174931469,
"learning_rate": 4.709763917461537e-06,
"loss": 0.1057,
"step": 617
},
{
"epoch": 1.038655462184874,
"grad_norm": 4.306617374054065,
"learning_rate": 4.696587232321691e-06,
"loss": 0.092,
"step": 618
},
{
"epoch": 1.0403361344537816,
"grad_norm": 3.948492964553929,
"learning_rate": 4.683412661836351e-06,
"loss": 0.1047,
"step": 619
},
{
"epoch": 1.0420168067226891,
"grad_norm": 3.7774272002812275,
"learning_rate": 4.6702402978265235e-06,
"loss": 0.0482,
"step": 620
},
{
"epoch": 1.0436974789915967,
"grad_norm": 3.774510074895978,
"learning_rate": 4.657070232097832e-06,
"loss": 0.0678,
"step": 621
},
{
"epoch": 1.0453781512605043,
"grad_norm": 4.231562137608742,
"learning_rate": 4.643902556439885e-06,
"loss": 0.0613,
"step": 622
},
{
"epoch": 1.0470588235294118,
"grad_norm": 3.470487492807781,
"learning_rate": 4.630737362625631e-06,
"loss": 0.0759,
"step": 623
},
{
"epoch": 1.0487394957983194,
"grad_norm": 3.309954439385579,
"learning_rate": 4.6175747424107234e-06,
"loss": 0.0688,
"step": 624
},
{
"epoch": 1.050420168067227,
"grad_norm": 6.516977912688737,
"learning_rate": 4.604414787532877e-06,
"loss": 0.128,
"step": 625
},
{
"epoch": 1.0521008403361345,
"grad_norm": 2.82864423728578,
"learning_rate": 4.591257589711233e-06,
"loss": 0.0733,
"step": 626
},
{
"epoch": 1.053781512605042,
"grad_norm": 4.24023591542261,
"learning_rate": 4.578103240645714e-06,
"loss": 0.0933,
"step": 627
},
{
"epoch": 1.0554621848739496,
"grad_norm": 3.5934583792735495,
"learning_rate": 4.5649518320163885e-06,
"loss": 0.0521,
"step": 628
},
{
"epoch": 1.0571428571428572,
"grad_norm": 2.4529016487205757,
"learning_rate": 4.551803455482833e-06,
"loss": 0.0679,
"step": 629
},
{
"epoch": 1.0588235294117647,
"grad_norm": 3.6105584004853983,
"learning_rate": 4.53865820268349e-06,
"loss": 0.0784,
"step": 630
},
{
"epoch": 1.0605042016806723,
"grad_norm": 4.102058266909981,
"learning_rate": 4.525516165235034e-06,
"loss": 0.1085,
"step": 631
},
{
"epoch": 1.0621848739495798,
"grad_norm": 3.2927361829912827,
"learning_rate": 4.512377434731724e-06,
"loss": 0.0807,
"step": 632
},
{
"epoch": 1.0638655462184874,
"grad_norm": 2.42452755770337,
"learning_rate": 4.499242102744777e-06,
"loss": 0.0536,
"step": 633
},
{
"epoch": 1.065546218487395,
"grad_norm": 4.397700003303606,
"learning_rate": 4.486110260821722e-06,
"loss": 0.0999,
"step": 634
},
{
"epoch": 1.0672268907563025,
"grad_norm": 4.5796843219518415,
"learning_rate": 4.4729820004857625e-06,
"loss": 0.1061,
"step": 635
},
{
"epoch": 1.06890756302521,
"grad_norm": 3.249155400076322,
"learning_rate": 4.45985741323514e-06,
"loss": 0.0637,
"step": 636
},
{
"epoch": 1.0705882352941176,
"grad_norm": 3.195791033403845,
"learning_rate": 4.446736590542497e-06,
"loss": 0.0495,
"step": 637
},
{
"epoch": 1.0722689075630252,
"grad_norm": 4.633287166254214,
"learning_rate": 4.4336196238542394e-06,
"loss": 0.1159,
"step": 638
},
{
"epoch": 1.0739495798319327,
"grad_norm": 3.7403134736334422,
"learning_rate": 4.420506604589897e-06,
"loss": 0.0908,
"step": 639
},
{
"epoch": 1.0756302521008403,
"grad_norm": 3.455796997777909,
"learning_rate": 4.407397624141487e-06,
"loss": 0.0933,
"step": 640
},
{
"epoch": 1.0773109243697478,
"grad_norm": 4.311267532340438,
"learning_rate": 4.394292773872882e-06,
"loss": 0.084,
"step": 641
},
{
"epoch": 1.0789915966386554,
"grad_norm": 3.1805108046865107,
"learning_rate": 4.381192145119165e-06,
"loss": 0.0899,
"step": 642
},
{
"epoch": 1.080672268907563,
"grad_norm": 4.48640343008081,
"learning_rate": 4.368095829185999e-06,
"loss": 0.0764,
"step": 643
},
{
"epoch": 1.0823529411764705,
"grad_norm": 3.7052198008043495,
"learning_rate": 4.355003917348985e-06,
"loss": 0.0927,
"step": 644
},
{
"epoch": 1.084033613445378,
"grad_norm": 2.728516187495892,
"learning_rate": 4.341916500853034e-06,
"loss": 0.0508,
"step": 645
},
{
"epoch": 1.0857142857142856,
"grad_norm": 3.1223532912143828,
"learning_rate": 4.3288336709117246e-06,
"loss": 0.07,
"step": 646
},
{
"epoch": 1.0873949579831932,
"grad_norm": 2.5244693951878205,
"learning_rate": 4.315755518706667e-06,
"loss": 0.0675,
"step": 647
},
{
"epoch": 1.0890756302521007,
"grad_norm": 3.005200455986411,
"learning_rate": 4.302682135386873e-06,
"loss": 0.0769,
"step": 648
},
{
"epoch": 1.0907563025210083,
"grad_norm": 3.1629945090674543,
"learning_rate": 4.289613612068118e-06,
"loss": 0.0588,
"step": 649
},
{
"epoch": 1.092436974789916,
"grad_norm": 3.2297557131977874,
"learning_rate": 4.276550039832299e-06,
"loss": 0.0618,
"step": 650
},
{
"epoch": 1.0941176470588236,
"grad_norm": 3.8464690490881597,
"learning_rate": 4.263491509726812e-06,
"loss": 0.0811,
"step": 651
},
{
"epoch": 1.0957983193277312,
"grad_norm": 2.483505006443395,
"learning_rate": 4.250438112763911e-06,
"loss": 0.0409,
"step": 652
},
{
"epoch": 1.0974789915966388,
"grad_norm": 2.6783407216299455,
"learning_rate": 4.237389939920075e-06,
"loss": 0.067,
"step": 653
},
{
"epoch": 1.0991596638655463,
"grad_norm": 3.4364206440555,
"learning_rate": 4.224347082135367e-06,
"loss": 0.083,
"step": 654
},
{
"epoch": 1.1008403361344539,
"grad_norm": 4.483374784861664,
"learning_rate": 4.211309630312813e-06,
"loss": 0.0707,
"step": 655
},
{
"epoch": 1.1025210084033614,
"grad_norm": 2.8804417997633776,
"learning_rate": 4.198277675317758e-06,
"loss": 0.0783,
"step": 656
},
{
"epoch": 1.104201680672269,
"grad_norm": 4.024917127829357,
"learning_rate": 4.185251307977241e-06,
"loss": 0.0952,
"step": 657
},
{
"epoch": 1.1058823529411765,
"grad_norm": 3.029919492161537,
"learning_rate": 4.17223061907935e-06,
"loss": 0.0703,
"step": 658
},
{
"epoch": 1.107563025210084,
"grad_norm": 4.159762100625219,
"learning_rate": 4.159215699372605e-06,
"loss": 0.0705,
"step": 659
},
{
"epoch": 1.1092436974789917,
"grad_norm": 2.585359252326579,
"learning_rate": 4.146206639565313e-06,
"loss": 0.0588,
"step": 660
},
{
"epoch": 1.1109243697478992,
"grad_norm": 3.1077598416909624,
"learning_rate": 4.133203530324938e-06,
"loss": 0.0714,
"step": 661
},
{
"epoch": 1.1126050420168068,
"grad_norm": 3.8441626427860274,
"learning_rate": 4.120206462277478e-06,
"loss": 0.0874,
"step": 662
},
{
"epoch": 1.1142857142857143,
"grad_norm": 2.9422268635935436,
"learning_rate": 4.107215526006818e-06,
"loss": 0.0598,
"step": 663
},
{
"epoch": 1.1159663865546219,
"grad_norm": 3.1192685111992935,
"learning_rate": 4.094230812054113e-06,
"loss": 0.0721,
"step": 664
},
{
"epoch": 1.1176470588235294,
"grad_norm": 3.53377011926501,
"learning_rate": 4.081252410917148e-06,
"loss": 0.0682,
"step": 665
},
{
"epoch": 1.119327731092437,
"grad_norm": 3.674753388626168,
"learning_rate": 4.068280413049715e-06,
"loss": 0.0825,
"step": 666
},
{
"epoch": 1.1210084033613446,
"grad_norm": 3.9666213447083067,
"learning_rate": 4.0553149088609745e-06,
"loss": 0.1018,
"step": 667
},
{
"epoch": 1.122689075630252,
"grad_norm": 3.6420396886380724,
"learning_rate": 4.042355988714828e-06,
"loss": 0.0831,
"step": 668
},
{
"epoch": 1.1243697478991597,
"grad_norm": 3.9967836672599324,
"learning_rate": 4.02940374292929e-06,
"loss": 0.1076,
"step": 669
},
{
"epoch": 1.1260504201680672,
"grad_norm": 3.607820316984983,
"learning_rate": 4.01645826177586e-06,
"loss": 0.0665,
"step": 670
},
{
"epoch": 1.1277310924369748,
"grad_norm": 4.383577038304497,
"learning_rate": 4.003519635478889e-06,
"loss": 0.0983,
"step": 671
},
{
"epoch": 1.1294117647058823,
"grad_norm": 3.045483064524733,
"learning_rate": 3.99058795421495e-06,
"loss": 0.0738,
"step": 672
},
{
"epoch": 1.13109243697479,
"grad_norm": 3.5868924253500696,
"learning_rate": 3.977663308112216e-06,
"loss": 0.078,
"step": 673
},
{
"epoch": 1.1327731092436975,
"grad_norm": 3.9552854654283243,
"learning_rate": 3.9647457872498295e-06,
"loss": 0.1083,
"step": 674
},
{
"epoch": 1.134453781512605,
"grad_norm": 3.123737566538539,
"learning_rate": 3.951835481657268e-06,
"loss": 0.0597,
"step": 675
},
{
"epoch": 1.1361344537815126,
"grad_norm": 4.412159709913494,
"learning_rate": 3.938932481313725e-06,
"loss": 0.0911,
"step": 676
},
{
"epoch": 1.1378151260504201,
"grad_norm": 5.484517514561643,
"learning_rate": 3.926036876147479e-06,
"loss": 0.0883,
"step": 677
},
{
"epoch": 1.1394957983193277,
"grad_norm": 4.06808403561468,
"learning_rate": 3.913148756035266e-06,
"loss": 0.0825,
"step": 678
},
{
"epoch": 1.1411764705882352,
"grad_norm": 2.9469250740798554,
"learning_rate": 3.9002682108016585e-06,
"loss": 0.0719,
"step": 679
},
{
"epoch": 1.1428571428571428,
"grad_norm": 2.48192021077067,
"learning_rate": 3.887395330218429e-06,
"loss": 0.0454,
"step": 680
},
{
"epoch": 1.1445378151260504,
"grad_norm": 3.4541071171494875,
"learning_rate": 3.874530204003937e-06,
"loss": 0.0899,
"step": 681
},
{
"epoch": 1.146218487394958,
"grad_norm": 3.4381814076738797,
"learning_rate": 3.861672921822493e-06,
"loss": 0.0729,
"step": 682
},
{
"epoch": 1.1478991596638655,
"grad_norm": 3.5999940971089748,
"learning_rate": 3.848823573283742e-06,
"loss": 0.0708,
"step": 683
},
{
"epoch": 1.149579831932773,
"grad_norm": 3.649458495375344,
"learning_rate": 3.83598224794203e-06,
"loss": 0.083,
"step": 684
},
{
"epoch": 1.1512605042016806,
"grad_norm": 3.375981309835745,
"learning_rate": 3.823149035295789e-06,
"loss": 0.0819,
"step": 685
},
{
"epoch": 1.1529411764705881,
"grad_norm": 3.970338952510589,
"learning_rate": 3.8103240247869077e-06,
"loss": 0.0652,
"step": 686
},
{
"epoch": 1.1546218487394957,
"grad_norm": 2.7421482149314342,
"learning_rate": 3.7975073058001092e-06,
"loss": 0.0586,
"step": 687
},
{
"epoch": 1.1563025210084033,
"grad_norm": 3.268321864940929,
"learning_rate": 3.7846989676623265e-06,
"loss": 0.0926,
"step": 688
},
{
"epoch": 1.1579831932773108,
"grad_norm": 3.586775391618902,
"learning_rate": 3.7718990996420875e-06,
"loss": 0.0733,
"step": 689
},
{
"epoch": 1.1596638655462184,
"grad_norm": 3.3748255405441534,
"learning_rate": 3.7591077909488817e-06,
"loss": 0.0696,
"step": 690
},
{
"epoch": 1.1613445378151261,
"grad_norm": 3.8838712829074242,
"learning_rate": 3.7463251307325432e-06,
"loss": 0.0734,
"step": 691
},
{
"epoch": 1.1630252100840337,
"grad_norm": 3.0131056465098713,
"learning_rate": 3.7335512080826363e-06,
"loss": 0.0698,
"step": 692
},
{
"epoch": 1.1647058823529413,
"grad_norm": 3.358638177868907,
"learning_rate": 3.720786112027822e-06,
"loss": 0.0905,
"step": 693
},
{
"epoch": 1.1663865546218488,
"grad_norm": 2.9670467971339773,
"learning_rate": 3.708029931535246e-06,
"loss": 0.0645,
"step": 694
},
{
"epoch": 1.1680672268907564,
"grad_norm": 4.426026291295541,
"learning_rate": 3.695282755509917e-06,
"loss": 0.0848,
"step": 695
},
{
"epoch": 1.169747899159664,
"grad_norm": 2.6536083172365896,
"learning_rate": 3.682544672794085e-06,
"loss": 0.0671,
"step": 696
},
{
"epoch": 1.1714285714285715,
"grad_norm": 2.784629510362822,
"learning_rate": 3.669815772166625e-06,
"loss": 0.0719,
"step": 697
},
{
"epoch": 1.173109243697479,
"grad_norm": 3.0434417394574007,
"learning_rate": 3.6570961423424155e-06,
"loss": 0.1106,
"step": 698
},
{
"epoch": 1.1747899159663866,
"grad_norm": 3.181246700838244,
"learning_rate": 3.644385871971725e-06,
"loss": 0.0604,
"step": 699
},
{
"epoch": 1.1764705882352942,
"grad_norm": 2.792428283134699,
"learning_rate": 3.6316850496395863e-06,
"loss": 0.0639,
"step": 700
},
{
"epoch": 1.1781512605042017,
"grad_norm": 3.3226768232667054,
"learning_rate": 3.618993763865185e-06,
"loss": 0.0792,
"step": 701
},
{
"epoch": 1.1798319327731093,
"grad_norm": 2.857505547869181,
"learning_rate": 3.6063121031012417e-06,
"loss": 0.0578,
"step": 702
},
{
"epoch": 1.1815126050420168,
"grad_norm": 3.2551731380467888,
"learning_rate": 3.5936401557333957e-06,
"loss": 0.0751,
"step": 703
},
{
"epoch": 1.1831932773109244,
"grad_norm": 2.8917175801511386,
"learning_rate": 3.5809780100795853e-06,
"loss": 0.0756,
"step": 704
},
{
"epoch": 1.184873949579832,
"grad_norm": 3.671906721272244,
"learning_rate": 3.568325754389438e-06,
"loss": 0.1162,
"step": 705
},
{
"epoch": 1.1865546218487395,
"grad_norm": 3.3302257093373004,
"learning_rate": 3.5556834768436498e-06,
"loss": 0.0841,
"step": 706
},
{
"epoch": 1.188235294117647,
"grad_norm": 4.600875308240711,
"learning_rate": 3.5430512655533774e-06,
"loss": 0.0997,
"step": 707
},
{
"epoch": 1.1899159663865546,
"grad_norm": 2.681493548071181,
"learning_rate": 3.5304292085596166e-06,
"loss": 0.0587,
"step": 708
},
{
"epoch": 1.1915966386554622,
"grad_norm": 4.633166803844994,
"learning_rate": 3.517817393832593e-06,
"loss": 0.082,
"step": 709
},
{
"epoch": 1.1932773109243697,
"grad_norm": 3.112723495852739,
"learning_rate": 3.505215909271149e-06,
"loss": 0.071,
"step": 710
},
{
"epoch": 1.1949579831932773,
"grad_norm": 4.400636552843737,
"learning_rate": 3.4926248427021293e-06,
"loss": 0.115,
"step": 711
},
{
"epoch": 1.1966386554621848,
"grad_norm": 3.6647916654013026,
"learning_rate": 3.48004428187977e-06,
"loss": 0.1058,
"step": 712
},
{
"epoch": 1.1983193277310924,
"grad_norm": 5.140848639409356,
"learning_rate": 3.4674743144850865e-06,
"loss": 0.135,
"step": 713
},
{
"epoch": 1.2,
"grad_norm": 3.150576093066749,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.0823,
"step": 714
},
{
"epoch": 1.2016806722689075,
"grad_norm": 2.812928703712718,
"learning_rate": 3.442366510333043e-06,
"loss": 0.0842,
"step": 715
},
{
"epoch": 1.203361344537815,
"grad_norm": 3.212135115117658,
"learning_rate": 3.4298288485661147e-06,
"loss": 0.0751,
"step": 716
},
{
"epoch": 1.2050420168067226,
"grad_norm": 3.799042933874438,
"learning_rate": 3.417302130206507e-06,
"loss": 0.083,
"step": 717
},
{
"epoch": 1.2067226890756302,
"grad_norm": 4.475769779293008,
"learning_rate": 3.404786442559976e-06,
"loss": 0.1013,
"step": 718
},
{
"epoch": 1.2084033613445377,
"grad_norm": 4.06485466865869,
"learning_rate": 3.3922818728554008e-06,
"loss": 0.0919,
"step": 719
},
{
"epoch": 1.2100840336134453,
"grad_norm": 2.41822548073786,
"learning_rate": 3.3797885082441717e-06,
"loss": 0.0514,
"step": 720
},
{
"epoch": 1.2117647058823529,
"grad_norm": 3.4968966029084623,
"learning_rate": 3.3673064357995844e-06,
"loss": 0.0861,
"step": 721
},
{
"epoch": 1.2134453781512604,
"grad_norm": 3.474316911856891,
"learning_rate": 3.3548357425162347e-06,
"loss": 0.0941,
"step": 722
},
{
"epoch": 1.2151260504201682,
"grad_norm": 2.865139040558747,
"learning_rate": 3.3423765153094097e-06,
"loss": 0.0683,
"step": 723
},
{
"epoch": 1.2168067226890757,
"grad_norm": 3.1100792648330926,
"learning_rate": 3.3299288410144813e-06,
"loss": 0.0632,
"step": 724
},
{
"epoch": 1.2184873949579833,
"grad_norm": 3.612471717248503,
"learning_rate": 3.3174928063863054e-06,
"loss": 0.0638,
"step": 725
},
{
"epoch": 1.2201680672268909,
"grad_norm": 2.571773313771785,
"learning_rate": 3.3050684980986105e-06,
"loss": 0.0516,
"step": 726
},
{
"epoch": 1.2218487394957984,
"grad_norm": 3.2675354986489147,
"learning_rate": 3.292656002743405e-06,
"loss": 0.0638,
"step": 727
},
{
"epoch": 1.223529411764706,
"grad_norm": 3.5021127435373023,
"learning_rate": 3.2802554068303595e-06,
"loss": 0.0783,
"step": 728
},
{
"epoch": 1.2252100840336135,
"grad_norm": 2.9029895571183446,
"learning_rate": 3.267866796786212e-06,
"loss": 0.0666,
"step": 729
},
{
"epoch": 1.226890756302521,
"grad_norm": 2.381919361917146,
"learning_rate": 3.255490258954167e-06,
"loss": 0.0358,
"step": 730
},
{
"epoch": 1.2285714285714286,
"grad_norm": 4.37685199400935,
"learning_rate": 3.2431258795932863e-06,
"loss": 0.1048,
"step": 731
},
{
"epoch": 1.2302521008403362,
"grad_norm": 3.563536843629901,
"learning_rate": 3.2307737448778977e-06,
"loss": 0.0861,
"step": 732
},
{
"epoch": 1.2319327731092438,
"grad_norm": 2.6074609627736387,
"learning_rate": 3.2184339408969857e-06,
"loss": 0.0591,
"step": 733
},
{
"epoch": 1.2336134453781513,
"grad_norm": 3.222255678882669,
"learning_rate": 3.206106553653596e-06,
"loss": 0.068,
"step": 734
},
{
"epoch": 1.2352941176470589,
"grad_norm": 3.0662427438038176,
"learning_rate": 3.1937916690642356e-06,
"loss": 0.059,
"step": 735
},
{
"epoch": 1.2369747899159664,
"grad_norm": 3.9738908619535196,
"learning_rate": 3.181489372958272e-06,
"loss": 0.0813,
"step": 736
},
{
"epoch": 1.238655462184874,
"grad_norm": 3.4998507094974367,
"learning_rate": 3.1691997510773376e-06,
"loss": 0.0738,
"step": 737
},
{
"epoch": 1.2403361344537815,
"grad_norm": 3.360066353012223,
"learning_rate": 3.1569228890747305e-06,
"loss": 0.0915,
"step": 738
},
{
"epoch": 1.242016806722689,
"grad_norm": 3.7980964862934936,
"learning_rate": 3.1446588725148186e-06,
"loss": 0.0792,
"step": 739
},
{
"epoch": 1.2436974789915967,
"grad_norm": 4.364395487305871,
"learning_rate": 3.132407786872442e-06,
"loss": 0.0953,
"step": 740
},
{
"epoch": 1.2453781512605042,
"grad_norm": 4.524790558239161,
"learning_rate": 3.120169717532319e-06,
"loss": 0.114,
"step": 741
},
{
"epoch": 1.2470588235294118,
"grad_norm": 2.501082399010883,
"learning_rate": 3.107944749788449e-06,
"loss": 0.0666,
"step": 742
},
{
"epoch": 1.2487394957983193,
"grad_norm": 3.165692341292902,
"learning_rate": 3.095732968843519e-06,
"loss": 0.0706,
"step": 743
},
{
"epoch": 1.250420168067227,
"grad_norm": 3.1228575299987136,
"learning_rate": 3.0835344598083095e-06,
"loss": 0.0681,
"step": 744
},
{
"epoch": 1.2521008403361344,
"grad_norm": 3.92884344358869,
"learning_rate": 3.0713493077011027e-06,
"loss": 0.0908,
"step": 745
},
{
"epoch": 1.253781512605042,
"grad_norm": 2.365391092045409,
"learning_rate": 3.059177597447087e-06,
"loss": 0.0598,
"step": 746
},
{
"epoch": 1.2554621848739496,
"grad_norm": 4.073880181854191,
"learning_rate": 3.0470194138777707e-06,
"loss": 0.0814,
"step": 747
},
{
"epoch": 1.2571428571428571,
"grad_norm": 4.477490809903387,
"learning_rate": 3.0348748417303826e-06,
"loss": 0.0878,
"step": 748
},
{
"epoch": 1.2588235294117647,
"grad_norm": 2.7792946761864497,
"learning_rate": 3.0227439656472878e-06,
"loss": 0.0501,
"step": 749
},
{
"epoch": 1.2605042016806722,
"grad_norm": 2.4283549892797485,
"learning_rate": 3.0106268701753967e-06,
"loss": 0.0481,
"step": 750
},
{
"epoch": 1.2621848739495798,
"grad_norm": 3.994164729590725,
"learning_rate": 2.9985236397655726e-06,
"loss": 0.0796,
"step": 751
},
{
"epoch": 1.2638655462184873,
"grad_norm": 3.2881789906717733,
"learning_rate": 2.986434358772048e-06,
"loss": 0.054,
"step": 752
},
{
"epoch": 1.265546218487395,
"grad_norm": 3.0735868424802653,
"learning_rate": 2.974359111451831e-06,
"loss": 0.0978,
"step": 753
},
{
"epoch": 1.2672268907563025,
"grad_norm": 2.6779749000507453,
"learning_rate": 2.962297981964124e-06,
"loss": 0.0501,
"step": 754
},
{
"epoch": 1.26890756302521,
"grad_norm": 3.9611547652129095,
"learning_rate": 2.950251054369733e-06,
"loss": 0.0769,
"step": 755
},
{
"epoch": 1.2705882352941176,
"grad_norm": 3.6957752079688095,
"learning_rate": 2.9382184126304834e-06,
"loss": 0.0728,
"step": 756
},
{
"epoch": 1.2722689075630251,
"grad_norm": 2.7774794174969757,
"learning_rate": 2.926200140608634e-06,
"loss": 0.0765,
"step": 757
},
{
"epoch": 1.2739495798319327,
"grad_norm": 2.604238358761592,
"learning_rate": 2.9141963220662917e-06,
"loss": 0.0557,
"step": 758
},
{
"epoch": 1.2756302521008402,
"grad_norm": 4.0260201986923585,
"learning_rate": 2.902207040664834e-06,
"loss": 0.0968,
"step": 759
},
{
"epoch": 1.2773109243697478,
"grad_norm": 3.830288993136352,
"learning_rate": 2.8902323799643116e-06,
"loss": 0.0868,
"step": 760
},
{
"epoch": 1.2789915966386554,
"grad_norm": 3.5769886193519693,
"learning_rate": 2.8782724234228876e-06,
"loss": 0.0647,
"step": 761
},
{
"epoch": 1.280672268907563,
"grad_norm": 5.759630745437997,
"learning_rate": 2.8663272543962305e-06,
"loss": 0.084,
"step": 762
},
{
"epoch": 1.2823529411764705,
"grad_norm": 3.1549545632891824,
"learning_rate": 2.8543969561369556e-06,
"loss": 0.0762,
"step": 763
},
{
"epoch": 1.284033613445378,
"grad_norm": 4.2731527378729846,
"learning_rate": 2.842481611794032e-06,
"loss": 0.0788,
"step": 764
},
{
"epoch": 1.2857142857142856,
"grad_norm": 3.4697998803710868,
"learning_rate": 2.83058130441221e-06,
"loss": 0.0547,
"step": 765
},
{
"epoch": 1.2873949579831931,
"grad_norm": 3.410299474955917,
"learning_rate": 2.818696116931431e-06,
"loss": 0.0837,
"step": 766
},
{
"epoch": 1.289075630252101,
"grad_norm": 3.5616877511635754,
"learning_rate": 2.8068261321862667e-06,
"loss": 0.0703,
"step": 767
},
{
"epoch": 1.2907563025210085,
"grad_norm": 3.0509370835754503,
"learning_rate": 2.794971432905323e-06,
"loss": 0.0577,
"step": 768
},
{
"epoch": 1.292436974789916,
"grad_norm": 2.8272150735022654,
"learning_rate": 2.7831321017106805e-06,
"loss": 0.0605,
"step": 769
},
{
"epoch": 1.2941176470588236,
"grad_norm": 4.147608557361858,
"learning_rate": 2.771308221117309e-06,
"loss": 0.1196,
"step": 770
},
{
"epoch": 1.2957983193277312,
"grad_norm": 3.3124997919407946,
"learning_rate": 2.7594998735324905e-06,
"loss": 0.0846,
"step": 771
},
{
"epoch": 1.2974789915966387,
"grad_norm": 3.8012586970938993,
"learning_rate": 2.7477071412552554e-06,
"loss": 0.096,
"step": 772
},
{
"epoch": 1.2991596638655463,
"grad_norm": 3.1553346227663543,
"learning_rate": 2.735930106475794e-06,
"loss": 0.0707,
"step": 773
},
{
"epoch": 1.3008403361344538,
"grad_norm": 2.7958280303718173,
"learning_rate": 2.724168851274901e-06,
"loss": 0.0715,
"step": 774
},
{
"epoch": 1.3025210084033614,
"grad_norm": 3.1372008613992772,
"learning_rate": 2.712423457623385e-06,
"loss": 0.0983,
"step": 775
},
{
"epoch": 1.304201680672269,
"grad_norm": 3.483988250360276,
"learning_rate": 2.7006940073815136e-06,
"loss": 0.0834,
"step": 776
},
{
"epoch": 1.3058823529411765,
"grad_norm": 3.2442585566233304,
"learning_rate": 2.6889805822984348e-06,
"loss": 0.0664,
"step": 777
},
{
"epoch": 1.307563025210084,
"grad_norm": 2.679757415437461,
"learning_rate": 2.6772832640116035e-06,
"loss": 0.0625,
"step": 778
},
{
"epoch": 1.3092436974789916,
"grad_norm": 3.715165066826765,
"learning_rate": 2.6656021340462246e-06,
"loss": 0.0976,
"step": 779
},
{
"epoch": 1.3109243697478992,
"grad_norm": 2.936192869439388,
"learning_rate": 2.6539372738146696e-06,
"loss": 0.0715,
"step": 780
},
{
"epoch": 1.3126050420168067,
"grad_norm": 4.020034837920706,
"learning_rate": 2.6422887646159234e-06,
"loss": 0.1028,
"step": 781
},
{
"epoch": 1.3142857142857143,
"grad_norm": 3.20595776083432,
"learning_rate": 2.6306566876350072e-06,
"loss": 0.0599,
"step": 782
},
{
"epoch": 1.3159663865546218,
"grad_norm": 3.1991584669083273,
"learning_rate": 2.619041123942419e-06,
"loss": 0.0848,
"step": 783
},
{
"epoch": 1.3176470588235294,
"grad_norm": 3.954684313309723,
"learning_rate": 2.607442154493568e-06,
"loss": 0.0737,
"step": 784
},
{
"epoch": 1.319327731092437,
"grad_norm": 2.6473103137935023,
"learning_rate": 2.5958598601282036e-06,
"loss": 0.0613,
"step": 785
},
{
"epoch": 1.3210084033613445,
"grad_norm": 2.826407561892185,
"learning_rate": 2.584294321569862e-06,
"loss": 0.0724,
"step": 786
},
{
"epoch": 1.322689075630252,
"grad_norm": 3.5898393382793494,
"learning_rate": 2.572745619425296e-06,
"loss": 0.0655,
"step": 787
},
{
"epoch": 1.3243697478991596,
"grad_norm": 3.252531020220819,
"learning_rate": 2.561213834183919e-06,
"loss": 0.0838,
"step": 788
},
{
"epoch": 1.3260504201680672,
"grad_norm": 2.530396975887401,
"learning_rate": 2.5496990462172344e-06,
"loss": 0.0399,
"step": 789
},
{
"epoch": 1.3277310924369747,
"grad_norm": 3.6056822958439327,
"learning_rate": 2.538201335778289e-06,
"loss": 0.0721,
"step": 790
},
{
"epoch": 1.3294117647058823,
"grad_norm": 2.94093572457041,
"learning_rate": 2.526720783001107e-06,
"loss": 0.07,
"step": 791
},
{
"epoch": 1.3310924369747898,
"grad_norm": 3.1830814602946877,
"learning_rate": 2.5152574679001236e-06,
"loss": 0.0855,
"step": 792
},
{
"epoch": 1.3327731092436974,
"grad_norm": 2.8576486748655636,
"learning_rate": 2.503811470369644e-06,
"loss": 0.081,
"step": 793
},
{
"epoch": 1.334453781512605,
"grad_norm": 3.754866972368774,
"learning_rate": 2.4923828701832682e-06,
"loss": 0.0722,
"step": 794
},
{
"epoch": 1.3361344537815127,
"grad_norm": 3.4010955615044955,
"learning_rate": 2.4809717469933543e-06,
"loss": 0.071,
"step": 795
},
{
"epoch": 1.3378151260504203,
"grad_norm": 3.309802090876078,
"learning_rate": 2.469578180330444e-06,
"loss": 0.0843,
"step": 796
},
{
"epoch": 1.3394957983193279,
"grad_norm": 3.719812421406123,
"learning_rate": 2.458202249602726e-06,
"loss": 0.0723,
"step": 797
},
{
"epoch": 1.3411764705882354,
"grad_norm": 2.964558896376801,
"learning_rate": 2.4468440340954664e-06,
"loss": 0.0696,
"step": 798
},
{
"epoch": 1.342857142857143,
"grad_norm": 3.111713776639778,
"learning_rate": 2.43550361297047e-06,
"loss": 0.0952,
"step": 799
},
{
"epoch": 1.3445378151260505,
"grad_norm": 3.9430475538259633,
"learning_rate": 2.4241810652655197e-06,
"loss": 0.0603,
"step": 800
},
{
"epoch": 1.346218487394958,
"grad_norm": 3.156839472343245,
"learning_rate": 2.4128764698938297e-06,
"loss": 0.0587,
"step": 801
},
{
"epoch": 1.3478991596638656,
"grad_norm": 2.9707320508794437,
"learning_rate": 2.4015899056434945e-06,
"loss": 0.0692,
"step": 802
},
{
"epoch": 1.3495798319327732,
"grad_norm": 3.6116340592816565,
"learning_rate": 2.390321451176936e-06,
"loss": 0.0675,
"step": 803
},
{
"epoch": 1.3512605042016808,
"grad_norm": 3.217311154453433,
"learning_rate": 2.379071185030365e-06,
"loss": 0.0732,
"step": 804
},
{
"epoch": 1.3529411764705883,
"grad_norm": 3.6926248506331576,
"learning_rate": 2.3678391856132203e-06,
"loss": 0.0571,
"step": 805
},
{
"epoch": 1.3546218487394959,
"grad_norm": 3.4897829856860816,
"learning_rate": 2.356625531207638e-06,
"loss": 0.0632,
"step": 806
},
{
"epoch": 1.3563025210084034,
"grad_norm": 2.9222780601737823,
"learning_rate": 2.345430299967888e-06,
"loss": 0.0645,
"step": 807
},
{
"epoch": 1.357983193277311,
"grad_norm": 2.6894558711212877,
"learning_rate": 2.334253569919846e-06,
"loss": 0.0704,
"step": 808
},
{
"epoch": 1.3596638655462185,
"grad_norm": 2.8905733444681325,
"learning_rate": 2.323095418960442e-06,
"loss": 0.0643,
"step": 809
},
{
"epoch": 1.361344537815126,
"grad_norm": 2.9154377681468997,
"learning_rate": 2.311955924857113e-06,
"loss": 0.08,
"step": 810
},
{
"epoch": 1.3630252100840337,
"grad_norm": 3.8727199729180435,
"learning_rate": 2.3008351652472714e-06,
"loss": 0.0867,
"step": 811
},
{
"epoch": 1.3647058823529412,
"grad_norm": 3.3845490730017267,
"learning_rate": 2.289733217637753e-06,
"loss": 0.0683,
"step": 812
},
{
"epoch": 1.3663865546218488,
"grad_norm": 3.373920847418989,
"learning_rate": 2.278650159404289e-06,
"loss": 0.0582,
"step": 813
},
{
"epoch": 1.3680672268907563,
"grad_norm": 3.2000406965333568,
"learning_rate": 2.267586067790952e-06,
"loss": 0.0761,
"step": 814
},
{
"epoch": 1.3697478991596639,
"grad_norm": 3.161658970701597,
"learning_rate": 2.2565410199096322e-06,
"loss": 0.0852,
"step": 815
},
{
"epoch": 1.3714285714285714,
"grad_norm": 3.7295821902310005,
"learning_rate": 2.245515092739488e-06,
"loss": 0.0645,
"step": 816
},
{
"epoch": 1.373109243697479,
"grad_norm": 3.691464649608567,
"learning_rate": 2.234508363126419e-06,
"loss": 0.1022,
"step": 817
},
{
"epoch": 1.3747899159663866,
"grad_norm": 3.474872908063579,
"learning_rate": 2.2235209077825264e-06,
"loss": 0.0904,
"step": 818
},
{
"epoch": 1.3764705882352941,
"grad_norm": 2.425412221727446,
"learning_rate": 2.2125528032855727e-06,
"loss": 0.054,
"step": 819
},
{
"epoch": 1.3781512605042017,
"grad_norm": 3.9835616677585004,
"learning_rate": 2.2016041260784604e-06,
"loss": 0.0738,
"step": 820
},
{
"epoch": 1.3798319327731092,
"grad_norm": 3.347359342136899,
"learning_rate": 2.1906749524686856e-06,
"loss": 0.0636,
"step": 821
},
{
"epoch": 1.3815126050420168,
"grad_norm": 2.658874860597347,
"learning_rate": 2.1797653586278193e-06,
"loss": 0.0655,
"step": 822
},
{
"epoch": 1.3831932773109243,
"grad_norm": 4.133400652102387,
"learning_rate": 2.168875420590965e-06,
"loss": 0.0849,
"step": 823
},
{
"epoch": 1.384873949579832,
"grad_norm": 3.4119688167441242,
"learning_rate": 2.158005214256236e-06,
"loss": 0.0762,
"step": 824
},
{
"epoch": 1.3865546218487395,
"grad_norm": 3.419640662395995,
"learning_rate": 2.147154815384226e-06,
"loss": 0.0725,
"step": 825
},
{
"epoch": 1.388235294117647,
"grad_norm": 3.2329083905175446,
"learning_rate": 2.136324299597474e-06,
"loss": 0.0656,
"step": 826
},
{
"epoch": 1.3899159663865546,
"grad_norm": 3.302465939051848,
"learning_rate": 2.12551374237995e-06,
"loss": 0.0841,
"step": 827
},
{
"epoch": 1.3915966386554621,
"grad_norm": 2.747466701190292,
"learning_rate": 2.1147232190765137e-06,
"loss": 0.0762,
"step": 828
},
{
"epoch": 1.3932773109243697,
"grad_norm": 3.529029515356708,
"learning_rate": 2.1039528048924043e-06,
"loss": 0.0614,
"step": 829
},
{
"epoch": 1.3949579831932772,
"grad_norm": 3.744954621522683,
"learning_rate": 2.0932025748927015e-06,
"loss": 0.0925,
"step": 830
},
{
"epoch": 1.3966386554621848,
"grad_norm": 2.8163888129776913,
"learning_rate": 2.0824726040018174e-06,
"loss": 0.0484,
"step": 831
},
{
"epoch": 1.3983193277310924,
"grad_norm": 3.021692852469359,
"learning_rate": 2.0717629670029653e-06,
"loss": 0.0567,
"step": 832
},
{
"epoch": 1.4,
"grad_norm": 3.8572687066728304,
"learning_rate": 2.061073738537635e-06,
"loss": 0.0679,
"step": 833
},
{
"epoch": 1.4016806722689075,
"grad_norm": 3.2949355182108127,
"learning_rate": 2.050404993105085e-06,
"loss": 0.0589,
"step": 834
},
{
"epoch": 1.403361344537815,
"grad_norm": 2.7633377494695495,
"learning_rate": 2.0397568050618095e-06,
"loss": 0.0609,
"step": 835
},
{
"epoch": 1.4050420168067226,
"grad_norm": 4.202873524739907,
"learning_rate": 2.0291292486210327e-06,
"loss": 0.1041,
"step": 836
},
{
"epoch": 1.4067226890756301,
"grad_norm": 3.0297174653829506,
"learning_rate": 2.018522397852178e-06,
"loss": 0.0577,
"step": 837
},
{
"epoch": 1.4084033613445377,
"grad_norm": 5.096519904720835,
"learning_rate": 2.0079363266803696e-06,
"loss": 0.1025,
"step": 838
},
{
"epoch": 1.4100840336134453,
"grad_norm": 3.4530721843432786,
"learning_rate": 1.9973711088858973e-06,
"loss": 0.0668,
"step": 839
},
{
"epoch": 1.4117647058823528,
"grad_norm": 4.742716235589666,
"learning_rate": 1.9868268181037186e-06,
"loss": 0.0825,
"step": 840
},
{
"epoch": 1.4134453781512604,
"grad_norm": 4.061809503594159,
"learning_rate": 1.976303527822933e-06,
"loss": 0.0916,
"step": 841
},
{
"epoch": 1.4151260504201681,
"grad_norm": 2.4976436771213466,
"learning_rate": 1.9658013113862806e-06,
"loss": 0.0509,
"step": 842
},
{
"epoch": 1.4168067226890757,
"grad_norm": 3.6299683241023644,
"learning_rate": 1.9553202419896256e-06,
"loss": 0.0788,
"step": 843
},
{
"epoch": 1.4184873949579833,
"grad_norm": 3.511030363971081,
"learning_rate": 1.944860392681442e-06,
"loss": 0.0861,
"step": 844
},
{
"epoch": 1.4201680672268908,
"grad_norm": 4.2560912674457825,
"learning_rate": 1.934421836362315e-06,
"loss": 0.0859,
"step": 845
},
{
"epoch": 1.4218487394957984,
"grad_norm": 2.7976331399633056,
"learning_rate": 1.9240046457844223e-06,
"loss": 0.0564,
"step": 846
},
{
"epoch": 1.423529411764706,
"grad_norm": 3.328285171512968,
"learning_rate": 1.913608893551036e-06,
"loss": 0.0787,
"step": 847
},
{
"epoch": 1.4252100840336135,
"grad_norm": 3.7878960189817876,
"learning_rate": 1.9032346521160066e-06,
"loss": 0.0724,
"step": 848
},
{
"epoch": 1.426890756302521,
"grad_norm": 2.8858346913152544,
"learning_rate": 1.8928819937832689e-06,
"loss": 0.0773,
"step": 849
},
{
"epoch": 1.4285714285714286,
"grad_norm": 2.7302815253012596,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.0723,
"step": 850
},
{
"epoch": 1.4302521008403362,
"grad_norm": 3.233101512998323,
"learning_rate": 1.8722417148877752e-06,
"loss": 0.0694,
"step": 851
},
{
"epoch": 1.4319327731092437,
"grad_norm": 3.3469400451646103,
"learning_rate": 1.8619542381787508e-06,
"loss": 0.0647,
"step": 852
},
{
"epoch": 1.4336134453781513,
"grad_norm": 3.133653343298944,
"learning_rate": 1.851688632278476e-06,
"loss": 0.0588,
"step": 853
},
{
"epoch": 1.4352941176470588,
"grad_norm": 3.6247472557751186,
"learning_rate": 1.8414449687337467e-06,
"loss": 0.0798,
"step": 854
},
{
"epoch": 1.4369747899159664,
"grad_norm": 3.8304422336907824,
"learning_rate": 1.8312233189384194e-06,
"loss": 0.0842,
"step": 855
},
{
"epoch": 1.438655462184874,
"grad_norm": 3.374646172759464,
"learning_rate": 1.821023754132933e-06,
"loss": 0.056,
"step": 856
},
{
"epoch": 1.4403361344537815,
"grad_norm": 3.934404310282199,
"learning_rate": 1.8108463454038022e-06,
"loss": 0.0747,
"step": 857
},
{
"epoch": 1.442016806722689,
"grad_norm": 3.363490582454245,
"learning_rate": 1.800691163683118e-06,
"loss": 0.0607,
"step": 858
},
{
"epoch": 1.4436974789915966,
"grad_norm": 3.3531978696324085,
"learning_rate": 1.790558279748067e-06,
"loss": 0.117,
"step": 859
},
{
"epoch": 1.4453781512605042,
"grad_norm": 2.9364782421206628,
"learning_rate": 1.780447764220422e-06,
"loss": 0.067,
"step": 860
},
{
"epoch": 1.4470588235294117,
"grad_norm": 2.927454347382001,
"learning_rate": 1.7703596875660645e-06,
"loss": 0.0559,
"step": 861
},
{
"epoch": 1.4487394957983193,
"grad_norm": 2.691782762898773,
"learning_rate": 1.7602941200944812e-06,
"loss": 0.051,
"step": 862
},
{
"epoch": 1.4504201680672268,
"grad_norm": 3.1537634566918156,
"learning_rate": 1.7502511319582855e-06,
"loss": 0.0588,
"step": 863
},
{
"epoch": 1.4521008403361344,
"grad_norm": 2.370144487772094,
"learning_rate": 1.7402307931527157e-06,
"loss": 0.0492,
"step": 864
},
{
"epoch": 1.453781512605042,
"grad_norm": 3.0105957574806217,
"learning_rate": 1.7302331735151594e-06,
"loss": 0.088,
"step": 865
},
{
"epoch": 1.4554621848739495,
"grad_norm": 3.279739938039434,
"learning_rate": 1.7202583427246633e-06,
"loss": 0.0902,
"step": 866
},
{
"epoch": 1.457142857142857,
"grad_norm": 2.7217645473901273,
"learning_rate": 1.7103063703014372e-06,
"loss": 0.0608,
"step": 867
},
{
"epoch": 1.4588235294117646,
"grad_norm": 2.5822955700611967,
"learning_rate": 1.7003773256063882e-06,
"loss": 0.0686,
"step": 868
},
{
"epoch": 1.4605042016806722,
"grad_norm": 3.63221129177691,
"learning_rate": 1.690471277840619e-06,
"loss": 0.0557,
"step": 869
},
{
"epoch": 1.46218487394958,
"grad_norm": 3.570715915651536,
"learning_rate": 1.6805882960449594e-06,
"loss": 0.0839,
"step": 870
},
{
"epoch": 1.4638655462184875,
"grad_norm": 4.328501618442257,
"learning_rate": 1.6707284490994746e-06,
"loss": 0.0749,
"step": 871
},
{
"epoch": 1.465546218487395,
"grad_norm": 3.433908960710609,
"learning_rate": 1.6608918057229944e-06,
"loss": 0.0646,
"step": 872
},
{
"epoch": 1.4672268907563026,
"grad_norm": 3.9068591126104173,
"learning_rate": 1.6510784344726294e-06,
"loss": 0.073,
"step": 873
},
{
"epoch": 1.4689075630252102,
"grad_norm": 2.2341793133831893,
"learning_rate": 1.6412884037432875e-06,
"loss": 0.0659,
"step": 874
},
{
"epoch": 1.4705882352941178,
"grad_norm": 3.0973040136017396,
"learning_rate": 1.6315217817672142e-06,
"loss": 0.0549,
"step": 875
},
{
"epoch": 1.4722689075630253,
"grad_norm": 2.90499093270013,
"learning_rate": 1.6217786366134953e-06,
"loss": 0.0617,
"step": 876
},
{
"epoch": 1.4739495798319329,
"grad_norm": 3.4404460266407746,
"learning_rate": 1.612059036187602e-06,
"loss": 0.0899,
"step": 877
},
{
"epoch": 1.4756302521008404,
"grad_norm": 2.3746789596139117,
"learning_rate": 1.6023630482309017e-06,
"loss": 0.0388,
"step": 878
},
{
"epoch": 1.477310924369748,
"grad_norm": 2.4723482281971836,
"learning_rate": 1.5926907403202001e-06,
"loss": 0.0697,
"step": 879
},
{
"epoch": 1.4789915966386555,
"grad_norm": 3.5714612174603197,
"learning_rate": 1.5830421798672568e-06,
"loss": 0.0533,
"step": 880
},
{
"epoch": 1.480672268907563,
"grad_norm": 3.755706414606266,
"learning_rate": 1.5734174341183284e-06,
"loss": 0.0821,
"step": 881
},
{
"epoch": 1.4823529411764707,
"grad_norm": 3.152624513233021,
"learning_rate": 1.5638165701536866e-06,
"loss": 0.0664,
"step": 882
},
{
"epoch": 1.4840336134453782,
"grad_norm": 3.911741470369773,
"learning_rate": 1.554239654887163e-06,
"loss": 0.0669,
"step": 883
},
{
"epoch": 1.4857142857142858,
"grad_norm": 2.9095761902447803,
"learning_rate": 1.544686755065677e-06,
"loss": 0.0747,
"step": 884
},
{
"epoch": 1.4873949579831933,
"grad_norm": 3.0824905880712468,
"learning_rate": 1.5351579372687658e-06,
"loss": 0.0633,
"step": 885
},
{
"epoch": 1.4890756302521009,
"grad_norm": 4.286560398167493,
"learning_rate": 1.525653267908132e-06,
"loss": 0.0796,
"step": 886
},
{
"epoch": 1.4907563025210084,
"grad_norm": 3.4776797987296173,
"learning_rate": 1.5161728132271674e-06,
"loss": 0.0847,
"step": 887
},
{
"epoch": 1.492436974789916,
"grad_norm": 2.453146716219204,
"learning_rate": 1.5067166393005055e-06,
"loss": 0.0501,
"step": 888
},
{
"epoch": 1.4941176470588236,
"grad_norm": 3.2190789264340722,
"learning_rate": 1.4972848120335453e-06,
"loss": 0.0785,
"step": 889
},
{
"epoch": 1.495798319327731,
"grad_norm": 3.0703657065913488,
"learning_rate": 1.4878773971620076e-06,
"loss": 0.0591,
"step": 890
},
{
"epoch": 1.4974789915966387,
"grad_norm": 3.139122876389468,
"learning_rate": 1.4784944602514662e-06,
"loss": 0.0512,
"step": 891
},
{
"epoch": 1.4991596638655462,
"grad_norm": 4.209187908547148,
"learning_rate": 1.4691360666968923e-06,
"loss": 0.1004,
"step": 892
},
{
"epoch": 1.5008403361344538,
"grad_norm": 3.0345447924030777,
"learning_rate": 1.4598022817222058e-06,
"loss": 0.0794,
"step": 893
},
{
"epoch": 1.5025210084033613,
"grad_norm": 3.890049029138699,
"learning_rate": 1.4504931703798086e-06,
"loss": 0.0656,
"step": 894
},
{
"epoch": 1.504201680672269,
"grad_norm": 3.314495638248082,
"learning_rate": 1.4412087975501459e-06,
"loss": 0.0622,
"step": 895
},
{
"epoch": 1.5058823529411764,
"grad_norm": 4.090407824067872,
"learning_rate": 1.4319492279412388e-06,
"loss": 0.0589,
"step": 896
},
{
"epoch": 1.507563025210084,
"grad_norm": 3.023129387472712,
"learning_rate": 1.4227145260882463e-06,
"loss": 0.0722,
"step": 897
},
{
"epoch": 1.5092436974789916,
"grad_norm": 4.878127172916561,
"learning_rate": 1.413504756353009e-06,
"loss": 0.1063,
"step": 898
},
{
"epoch": 1.5109243697478991,
"grad_norm": 3.6392372529586914,
"learning_rate": 1.4043199829235983e-06,
"loss": 0.077,
"step": 899
},
{
"epoch": 1.5126050420168067,
"grad_norm": 3.8202214737174747,
"learning_rate": 1.3951602698138773e-06,
"loss": 0.0704,
"step": 900
},
{
"epoch": 1.5142857142857142,
"grad_norm": 3.310249153920368,
"learning_rate": 1.3860256808630429e-06,
"loss": 0.0624,
"step": 901
},
{
"epoch": 1.5159663865546218,
"grad_norm": 2.5229149653319727,
"learning_rate": 1.3769162797351953e-06,
"loss": 0.0432,
"step": 902
},
{
"epoch": 1.5176470588235293,
"grad_norm": 4.604525464252146,
"learning_rate": 1.3678321299188802e-06,
"loss": 0.1086,
"step": 903
},
{
"epoch": 1.519327731092437,
"grad_norm": 2.9709223655943453,
"learning_rate": 1.3587732947266557e-06,
"loss": 0.0624,
"step": 904
},
{
"epoch": 1.5210084033613445,
"grad_norm": 3.331172827523415,
"learning_rate": 1.34973983729465e-06,
"loss": 0.0573,
"step": 905
},
{
"epoch": 1.522689075630252,
"grad_norm": 3.5693054861844367,
"learning_rate": 1.340731820582114e-06,
"loss": 0.0692,
"step": 906
},
{
"epoch": 1.5243697478991596,
"grad_norm": 3.4050102849137778,
"learning_rate": 1.3317493073709936e-06,
"loss": 0.0658,
"step": 907
},
{
"epoch": 1.5260504201680671,
"grad_norm": 2.7092516800684314,
"learning_rate": 1.3227923602654808e-06,
"loss": 0.0568,
"step": 908
},
{
"epoch": 1.5277310924369747,
"grad_norm": 2.948568924947018,
"learning_rate": 1.3138610416915887e-06,
"loss": 0.0655,
"step": 909
},
{
"epoch": 1.5294117647058822,
"grad_norm": 3.296455192826209,
"learning_rate": 1.3049554138967052e-06,
"loss": 0.0528,
"step": 910
},
{
"epoch": 1.5310924369747898,
"grad_norm": 2.567047257565957,
"learning_rate": 1.2960755389491703e-06,
"loss": 0.0467,
"step": 911
},
{
"epoch": 1.5327731092436974,
"grad_norm": 3.6779655883833304,
"learning_rate": 1.2872214787378306e-06,
"loss": 0.0677,
"step": 912
},
{
"epoch": 1.534453781512605,
"grad_norm": 4.383239615603696,
"learning_rate": 1.278393294971626e-06,
"loss": 0.1001,
"step": 913
},
{
"epoch": 1.5361344537815125,
"grad_norm": 3.239950652852017,
"learning_rate": 1.269591049179138e-06,
"loss": 0.0575,
"step": 914
},
{
"epoch": 1.53781512605042,
"grad_norm": 3.267287830498372,
"learning_rate": 1.2608148027081773e-06,
"loss": 0.0666,
"step": 915
},
{
"epoch": 1.5394957983193276,
"grad_norm": 3.8006140208799137,
"learning_rate": 1.2520646167253514e-06,
"loss": 0.0869,
"step": 916
},
{
"epoch": 1.5411764705882351,
"grad_norm": 3.632458756246524,
"learning_rate": 1.2433405522156334e-06,
"loss": 0.0534,
"step": 917
},
{
"epoch": 1.5428571428571427,
"grad_norm": 2.9078940235412976,
"learning_rate": 1.234642669981946e-06,
"loss": 0.0579,
"step": 918
},
{
"epoch": 1.5445378151260503,
"grad_norm": 3.7248910970111204,
"learning_rate": 1.2259710306447275e-06,
"loss": 0.0904,
"step": 919
},
{
"epoch": 1.5462184873949578,
"grad_norm": 3.306059100125909,
"learning_rate": 1.2173256946415214e-06,
"loss": 0.0705,
"step": 920
},
{
"epoch": 1.5478991596638656,
"grad_norm": 4.470317231270553,
"learning_rate": 1.2087067222265409e-06,
"loss": 0.0695,
"step": 921
},
{
"epoch": 1.5495798319327732,
"grad_norm": 3.749413820788246,
"learning_rate": 1.2001141734702625e-06,
"loss": 0.0758,
"step": 922
},
{
"epoch": 1.5512605042016807,
"grad_norm": 3.1346413732999325,
"learning_rate": 1.1915481082589998e-06,
"loss": 0.0591,
"step": 923
},
{
"epoch": 1.5529411764705883,
"grad_norm": 3.130619985905547,
"learning_rate": 1.1830085862944851e-06,
"loss": 0.0717,
"step": 924
},
{
"epoch": 1.5546218487394958,
"grad_norm": 2.2960691852895985,
"learning_rate": 1.17449566709346e-06,
"loss": 0.0382,
"step": 925
},
{
"epoch": 1.5563025210084034,
"grad_norm": 2.62733850882563,
"learning_rate": 1.166009409987251e-06,
"loss": 0.0532,
"step": 926
},
{
"epoch": 1.557983193277311,
"grad_norm": 2.987104215317316,
"learning_rate": 1.1575498741213682e-06,
"loss": 0.0616,
"step": 927
},
{
"epoch": 1.5596638655462185,
"grad_norm": 3.129195240880437,
"learning_rate": 1.1491171184550799e-06,
"loss": 0.0509,
"step": 928
},
{
"epoch": 1.561344537815126,
"grad_norm": 3.75801919908673,
"learning_rate": 1.1407112017610134e-06,
"loss": 0.0756,
"step": 929
},
{
"epoch": 1.5630252100840336,
"grad_norm": 4.590561742167558,
"learning_rate": 1.1323321826247347e-06,
"loss": 0.0709,
"step": 930
},
{
"epoch": 1.5647058823529412,
"grad_norm": 2.2659166137528097,
"learning_rate": 1.1239801194443507e-06,
"loss": 0.0469,
"step": 931
},
{
"epoch": 1.5663865546218487,
"grad_norm": 3.382390295282448,
"learning_rate": 1.115655070430096e-06,
"loss": 0.0571,
"step": 932
},
{
"epoch": 1.5680672268907563,
"grad_norm": 2.596026919066409,
"learning_rate": 1.107357093603924e-06,
"loss": 0.0474,
"step": 933
},
{
"epoch": 1.5697478991596638,
"grad_norm": 3.7190109549497956,
"learning_rate": 1.0990862467991132e-06,
"loss": 0.0767,
"step": 934
},
{
"epoch": 1.5714285714285714,
"grad_norm": 2.6518667247248713,
"learning_rate": 1.0908425876598512e-06,
"loss": 0.0614,
"step": 935
},
{
"epoch": 1.573109243697479,
"grad_norm": 3.3105529179765663,
"learning_rate": 1.082626173640846e-06,
"loss": 0.0713,
"step": 936
},
{
"epoch": 1.5747899159663865,
"grad_norm": 2.9534768924558956,
"learning_rate": 1.0744370620069122e-06,
"loss": 0.0699,
"step": 937
},
{
"epoch": 1.576470588235294,
"grad_norm": 2.833311203890012,
"learning_rate": 1.066275309832584e-06,
"loss": 0.0548,
"step": 938
},
{
"epoch": 1.5781512605042018,
"grad_norm": 4.798887220822771,
"learning_rate": 1.0581409740017113e-06,
"loss": 0.0954,
"step": 939
},
{
"epoch": 1.5798319327731094,
"grad_norm": 4.148894262060034,
"learning_rate": 1.0500341112070605e-06,
"loss": 0.0681,
"step": 940
},
{
"epoch": 1.581512605042017,
"grad_norm": 3.2754454999274287,
"learning_rate": 1.0419547779499283e-06,
"loss": 0.0767,
"step": 941
},
{
"epoch": 1.5831932773109245,
"grad_norm": 3.3373777087154974,
"learning_rate": 1.0339030305397374e-06,
"loss": 0.0525,
"step": 942
},
{
"epoch": 1.584873949579832,
"grad_norm": 3.0386973905398,
"learning_rate": 1.025878925093653e-06,
"loss": 0.0705,
"step": 943
},
{
"epoch": 1.5865546218487396,
"grad_norm": 2.5493203288809747,
"learning_rate": 1.0178825175361846e-06,
"loss": 0.05,
"step": 944
},
{
"epoch": 1.5882352941176472,
"grad_norm": 3.7835221762872275,
"learning_rate": 1.0099138635988026e-06,
"loss": 0.0793,
"step": 945
},
{
"epoch": 1.5899159663865547,
"grad_norm": 2.9769518109780786,
"learning_rate": 1.0019730188195464e-06,
"loss": 0.0482,
"step": 946
},
{
"epoch": 1.5915966386554623,
"grad_norm": 2.9047376160951126,
"learning_rate": 9.940600385426347e-07,
"loss": 0.0775,
"step": 947
},
{
"epoch": 1.5932773109243699,
"grad_norm": 3.7163601063599585,
"learning_rate": 9.861749779180873e-07,
"loss": 0.0792,
"step": 948
},
{
"epoch": 1.5949579831932774,
"grad_norm": 4.409032506672899,
"learning_rate": 9.783178919013297e-07,
"loss": 0.074,
"step": 949
},
{
"epoch": 1.596638655462185,
"grad_norm": 3.1957793653873647,
"learning_rate": 9.704888352528257e-07,
"loss": 0.0815,
"step": 950
},
{
"epoch": 1.5983193277310925,
"grad_norm": 2.611065141281167,
"learning_rate": 9.626878625376784e-07,
"loss": 0.0511,
"step": 951
},
{
"epoch": 1.6,
"grad_norm": 3.107864414307667,
"learning_rate": 9.549150281252633e-07,
"loss": 0.0726,
"step": 952
},
{
"epoch": 1.6016806722689076,
"grad_norm": 2.633882625896426,
"learning_rate": 9.471703861888398e-07,
"loss": 0.0695,
"step": 953
},
{
"epoch": 1.6033613445378152,
"grad_norm": 3.3594734416875345,
"learning_rate": 9.394539907051837e-07,
"loss": 0.0649,
"step": 954
},
{
"epoch": 1.6050420168067228,
"grad_norm": 2.461508884535206,
"learning_rate": 9.317658954541992e-07,
"loss": 0.0589,
"step": 955
},
{
"epoch": 1.6067226890756303,
"grad_norm": 4.539836468707796,
"learning_rate": 9.241061540185547e-07,
"loss": 0.0751,
"step": 956
},
{
"epoch": 1.6084033613445379,
"grad_norm": 3.3620051027546483,
"learning_rate": 9.164748197833039e-07,
"loss": 0.0533,
"step": 957
},
{
"epoch": 1.6100840336134454,
"grad_norm": 2.9983538488735184,
"learning_rate": 9.088719459355133e-07,
"loss": 0.0525,
"step": 958
},
{
"epoch": 1.611764705882353,
"grad_norm": 3.1377214405299765,
"learning_rate": 9.01297585463895e-07,
"loss": 0.0683,
"step": 959
},
{
"epoch": 1.6134453781512605,
"grad_norm": 3.2506007192617092,
"learning_rate": 8.937517911584321e-07,
"loss": 0.069,
"step": 960
},
{
"epoch": 1.615126050420168,
"grad_norm": 2.752232899637137,
"learning_rate": 8.862346156100188e-07,
"loss": 0.0452,
"step": 961
},
{
"epoch": 1.6168067226890757,
"grad_norm": 2.894169394808922,
"learning_rate": 8.787461112100837e-07,
"loss": 0.0609,
"step": 962
},
{
"epoch": 1.6184873949579832,
"grad_norm": 3.4459617448123434,
"learning_rate": 8.712863301502339e-07,
"loss": 0.0693,
"step": 963
},
{
"epoch": 1.6201680672268908,
"grad_norm": 4.242431845815356,
"learning_rate": 8.638553244218872e-07,
"loss": 0.085,
"step": 964
},
{
"epoch": 1.6218487394957983,
"grad_norm": 3.407760495616189,
"learning_rate": 8.56453145815907e-07,
"loss": 0.0675,
"step": 965
},
{
"epoch": 1.6235294117647059,
"grad_norm": 3.0306976690592413,
"learning_rate": 8.490798459222477e-07,
"loss": 0.0756,
"step": 966
},
{
"epoch": 1.6252100840336134,
"grad_norm": 2.84786975619905,
"learning_rate": 8.417354761295876e-07,
"loss": 0.0509,
"step": 967
},
{
"epoch": 1.626890756302521,
"grad_norm": 3.683064163624422,
"learning_rate": 8.344200876249803e-07,
"loss": 0.0771,
"step": 968
},
{
"epoch": 1.6285714285714286,
"grad_norm": 3.605480002311447,
"learning_rate": 8.271337313934869e-07,
"loss": 0.063,
"step": 969
},
{
"epoch": 1.6302521008403361,
"grad_norm": 3.906165742726037,
"learning_rate": 8.198764582178303e-07,
"loss": 0.0884,
"step": 970
},
{
"epoch": 1.6319327731092437,
"grad_norm": 3.210290270345182,
"learning_rate": 8.12648318678036e-07,
"loss": 0.061,
"step": 971
},
{
"epoch": 1.6336134453781512,
"grad_norm": 3.7077638002651745,
"learning_rate": 8.054493631510785e-07,
"loss": 0.0783,
"step": 972
},
{
"epoch": 1.6352941176470588,
"grad_norm": 3.00209725981241,
"learning_rate": 7.98279641810537e-07,
"loss": 0.0562,
"step": 973
},
{
"epoch": 1.6369747899159663,
"grad_norm": 2.755773701613939,
"learning_rate": 7.911392046262367e-07,
"loss": 0.0509,
"step": 974
},
{
"epoch": 1.638655462184874,
"grad_norm": 3.1721020032583405,
"learning_rate": 7.840281013639078e-07,
"loss": 0.0641,
"step": 975
},
{
"epoch": 1.6403361344537815,
"grad_norm": 3.2205453157951776,
"learning_rate": 7.769463815848344e-07,
"loss": 0.0721,
"step": 976
},
{
"epoch": 1.642016806722689,
"grad_norm": 3.0892435081374177,
"learning_rate": 7.698940946455125e-07,
"loss": 0.0687,
"step": 977
},
{
"epoch": 1.6436974789915966,
"grad_norm": 2.944546301438669,
"learning_rate": 7.628712896973006e-07,
"loss": 0.0472,
"step": 978
},
{
"epoch": 1.6453781512605041,
"grad_norm": 2.8245833208645563,
"learning_rate": 7.55878015686084e-07,
"loss": 0.054,
"step": 979
},
{
"epoch": 1.6470588235294117,
"grad_norm": 3.716112700120881,
"learning_rate": 7.489143213519301e-07,
"loss": 0.0695,
"step": 980
},
{
"epoch": 1.6487394957983192,
"grad_norm": 2.9711948721794377,
"learning_rate": 7.419802552287453e-07,
"loss": 0.0478,
"step": 981
},
{
"epoch": 1.6504201680672268,
"grad_norm": 2.9026496025038626,
"learning_rate": 7.350758656439455e-07,
"loss": 0.0527,
"step": 982
},
{
"epoch": 1.6521008403361344,
"grad_norm": 2.8302281641844287,
"learning_rate": 7.282012007181083e-07,
"loss": 0.0607,
"step": 983
},
{
"epoch": 1.653781512605042,
"grad_norm": 3.6058469193440392,
"learning_rate": 7.213563083646497e-07,
"loss": 0.0782,
"step": 984
},
{
"epoch": 1.6554621848739495,
"grad_norm": 3.7038887110593186,
"learning_rate": 7.145412362894771e-07,
"loss": 0.0737,
"step": 985
},
{
"epoch": 1.657142857142857,
"grad_norm": 3.0478216404333502,
"learning_rate": 7.077560319906696e-07,
"loss": 0.0848,
"step": 986
},
{
"epoch": 1.6588235294117646,
"grad_norm": 4.337845037581458,
"learning_rate": 7.010007427581378e-07,
"loss": 0.0821,
"step": 987
},
{
"epoch": 1.6605042016806721,
"grad_norm": 2.938832327531002,
"learning_rate": 6.942754156732978e-07,
"loss": 0.0957,
"step": 988
},
{
"epoch": 1.6621848739495797,
"grad_norm": 2.7760331192989316,
"learning_rate": 6.875800976087444e-07,
"loss": 0.0819,
"step": 989
},
{
"epoch": 1.6638655462184873,
"grad_norm": 2.617807812869756,
"learning_rate": 6.809148352279182e-07,
"loss": 0.0447,
"step": 990
},
{
"epoch": 1.6655462184873948,
"grad_norm": 2.9879632871750985,
"learning_rate": 6.742796749847908e-07,
"loss": 0.0593,
"step": 991
},
{
"epoch": 1.6672268907563024,
"grad_norm": 4.757558026364091,
"learning_rate": 6.676746631235282e-07,
"loss": 0.082,
"step": 992
},
{
"epoch": 1.66890756302521,
"grad_norm": 3.225928749204381,
"learning_rate": 6.61099845678183e-07,
"loss": 0.0554,
"step": 993
},
{
"epoch": 1.6705882352941175,
"grad_norm": 3.9904444022658496,
"learning_rate": 6.545552684723583e-07,
"loss": 0.0802,
"step": 994
},
{
"epoch": 1.6722689075630253,
"grad_norm": 3.3696742625546214,
"learning_rate": 6.480409771189027e-07,
"loss": 0.0722,
"step": 995
},
{
"epoch": 1.6739495798319328,
"grad_norm": 2.2477843738626855,
"learning_rate": 6.415570170195801e-07,
"loss": 0.0501,
"step": 996
},
{
"epoch": 1.6756302521008404,
"grad_norm": 3.6877994992049348,
"learning_rate": 6.351034333647615e-07,
"loss": 0.0725,
"step": 997
},
{
"epoch": 1.677310924369748,
"grad_norm": 3.9372860574992097,
"learning_rate": 6.286802711331097e-07,
"loss": 0.065,
"step": 998
},
{
"epoch": 1.6789915966386555,
"grad_norm": 4.448834128583502,
"learning_rate": 6.222875750912571e-07,
"loss": 0.0988,
"step": 999
},
{
"epoch": 1.680672268907563,
"grad_norm": 3.181414199217415,
"learning_rate": 6.159253897935069e-07,
"loss": 0.0622,
"step": 1000
},
{
"epoch": 1.680672268907563,
"eval_loss": 0.18958403170108795,
"eval_runtime": 1.1814,
"eval_samples_per_second": 41.477,
"eval_steps_per_second": 11.004,
"step": 1000
},
{
"epoch": 1.6823529411764706,
"grad_norm": 2.8667787652122203,
"learning_rate": 6.095937595815104e-07,
"loss": 0.0481,
"step": 1001
},
{
"epoch": 1.6840336134453782,
"grad_norm": 3.30814842995984,
"learning_rate": 6.032927285839674e-07,
"loss": 0.0781,
"step": 1002
},
{
"epoch": 1.6857142857142857,
"grad_norm": 3.273078173220105,
"learning_rate": 5.9702234071631e-07,
"loss": 0.0498,
"step": 1003
},
{
"epoch": 1.6873949579831933,
"grad_norm": 2.549502139708129,
"learning_rate": 5.907826396804062e-07,
"loss": 0.0573,
"step": 1004
},
{
"epoch": 1.6890756302521008,
"grad_norm": 2.7316380969088794,
"learning_rate": 5.845736689642472e-07,
"loss": 0.0696,
"step": 1005
},
{
"epoch": 1.6907563025210084,
"grad_norm": 2.9477520630378438,
"learning_rate": 5.783954718416468e-07,
"loss": 0.0517,
"step": 1006
},
{
"epoch": 1.692436974789916,
"grad_norm": 3.9905021450657863,
"learning_rate": 5.722480913719425e-07,
"loss": 0.0692,
"step": 1007
},
{
"epoch": 1.6941176470588235,
"grad_norm": 3.379977731241573,
"learning_rate": 5.661315703996905e-07,
"loss": 0.0574,
"step": 1008
},
{
"epoch": 1.695798319327731,
"grad_norm": 2.651344824049694,
"learning_rate": 5.600459515543733e-07,
"loss": 0.0536,
"step": 1009
},
{
"epoch": 1.6974789915966386,
"grad_norm": 3.839321294261383,
"learning_rate": 5.539912772500943e-07,
"loss": 0.0599,
"step": 1010
},
{
"epoch": 1.6991596638655462,
"grad_norm": 4.502537960250825,
"learning_rate": 5.47967589685292e-07,
"loss": 0.0764,
"step": 1011
},
{
"epoch": 1.7008403361344537,
"grad_norm": 3.066578903690047,
"learning_rate": 5.419749308424377e-07,
"loss": 0.0526,
"step": 1012
},
{
"epoch": 1.7025210084033613,
"grad_norm": 3.2961861585751944,
"learning_rate": 5.360133424877467e-07,
"loss": 0.0716,
"step": 1013
},
{
"epoch": 1.704201680672269,
"grad_norm": 4.278116825721235,
"learning_rate": 5.300828661708873e-07,
"loss": 0.1063,
"step": 1014
},
{
"epoch": 1.7058823529411766,
"grad_norm": 3.2236409490459597,
"learning_rate": 5.241835432246888e-07,
"loss": 0.0666,
"step": 1015
},
{
"epoch": 1.7075630252100842,
"grad_norm": 2.9602534255149524,
"learning_rate": 5.183154147648578e-07,
"loss": 0.0659,
"step": 1016
},
{
"epoch": 1.7092436974789917,
"grad_norm": 2.659719689119663,
"learning_rate": 5.124785216896854e-07,
"loss": 0.0487,
"step": 1017
},
{
"epoch": 1.7109243697478993,
"grad_norm": 2.807149009390532,
"learning_rate": 5.066729046797692e-07,
"loss": 0.0744,
"step": 1018
},
{
"epoch": 1.7126050420168069,
"grad_norm": 3.7321641841913458,
"learning_rate": 5.008986041977254e-07,
"loss": 0.0722,
"step": 1019
},
{
"epoch": 1.7142857142857144,
"grad_norm": 4.130707731441335,
"learning_rate": 4.951556604879049e-07,
"loss": 0.0788,
"step": 1020
},
{
"epoch": 1.715966386554622,
"grad_norm": 3.0918431678717972,
"learning_rate": 4.894441135761197e-07,
"loss": 0.0673,
"step": 1021
},
{
"epoch": 1.7176470588235295,
"grad_norm": 2.976439686671259,
"learning_rate": 4.837640032693558e-07,
"loss": 0.0636,
"step": 1022
},
{
"epoch": 1.719327731092437,
"grad_norm": 3.9989069666982906,
"learning_rate": 4.781153691555035e-07,
"loss": 0.1041,
"step": 1023
},
{
"epoch": 1.7210084033613446,
"grad_norm": 2.82983606873746,
"learning_rate": 4.724982506030762e-07,
"loss": 0.0647,
"step": 1024
},
{
"epoch": 1.7226890756302522,
"grad_norm": 3.484959680642574,
"learning_rate": 4.669126867609375e-07,
"loss": 0.0619,
"step": 1025
},
{
"epoch": 1.7243697478991598,
"grad_norm": 3.1730436291690203,
"learning_rate": 4.613587165580269e-07,
"loss": 0.0806,
"step": 1026
},
{
"epoch": 1.7260504201680673,
"grad_norm": 2.6705239615445007,
"learning_rate": 4.5583637870309397e-07,
"loss": 0.0463,
"step": 1027
},
{
"epoch": 1.7277310924369749,
"grad_norm": 2.595728117055968,
"learning_rate": 4.503457116844201e-07,
"loss": 0.0496,
"step": 1028
},
{
"epoch": 1.7294117647058824,
"grad_norm": 2.8656841743678068,
"learning_rate": 4.448867537695578e-07,
"loss": 0.0554,
"step": 1029
},
{
"epoch": 1.73109243697479,
"grad_norm": 3.253801626814162,
"learning_rate": 4.394595430050613e-07,
"loss": 0.0816,
"step": 1030
},
{
"epoch": 1.7327731092436975,
"grad_norm": 3.644665909418401,
"learning_rate": 4.34064117216218e-07,
"loss": 0.0697,
"step": 1031
},
{
"epoch": 1.734453781512605,
"grad_norm": 3.8362438500108507,
"learning_rate": 4.287005140067912e-07,
"loss": 0.0886,
"step": 1032
},
{
"epoch": 1.7361344537815127,
"grad_norm": 3.1234733214287402,
"learning_rate": 4.2336877075875136e-07,
"loss": 0.0612,
"step": 1033
},
{
"epoch": 1.7378151260504202,
"grad_norm": 2.386823444604084,
"learning_rate": 4.1806892463202353e-07,
"loss": 0.0407,
"step": 1034
},
{
"epoch": 1.7394957983193278,
"grad_norm": 2.802793611918228,
"learning_rate": 4.1280101256421903e-07,
"loss": 0.0522,
"step": 1035
},
{
"epoch": 1.7411764705882353,
"grad_norm": 2.4401900894075172,
"learning_rate": 4.0756507127038494e-07,
"loss": 0.0574,
"step": 1036
},
{
"epoch": 1.7428571428571429,
"grad_norm": 3.475878310133994,
"learning_rate": 4.0236113724274716e-07,
"loss": 0.0778,
"step": 1037
},
{
"epoch": 1.7445378151260504,
"grad_norm": 3.9184928910191963,
"learning_rate": 3.971892467504518e-07,
"loss": 0.0935,
"step": 1038
},
{
"epoch": 1.746218487394958,
"grad_norm": 4.216121004493787,
"learning_rate": 3.9204943583931953e-07,
"loss": 0.0713,
"step": 1039
},
{
"epoch": 1.7478991596638656,
"grad_norm": 3.0902619137106115,
"learning_rate": 3.869417403315856e-07,
"loss": 0.07,
"step": 1040
},
{
"epoch": 1.749579831932773,
"grad_norm": 3.5215787238811656,
"learning_rate": 3.8186619582565974e-07,
"loss": 0.0612,
"step": 1041
},
{
"epoch": 1.7512605042016807,
"grad_norm": 3.068365157929383,
"learning_rate": 3.7682283769586883e-07,
"loss": 0.0598,
"step": 1042
},
{
"epoch": 1.7529411764705882,
"grad_norm": 3.2060690731093295,
"learning_rate": 3.71811701092219e-07,
"loss": 0.0622,
"step": 1043
},
{
"epoch": 1.7546218487394958,
"grad_norm": 4.08757596782293,
"learning_rate": 3.6683282094014285e-07,
"loss": 0.0893,
"step": 1044
},
{
"epoch": 1.7563025210084033,
"grad_norm": 2.6145817155772684,
"learning_rate": 3.6188623194026105e-07,
"loss": 0.0519,
"step": 1045
},
{
"epoch": 1.757983193277311,
"grad_norm": 2.914336649954542,
"learning_rate": 3.569719685681405e-07,
"loss": 0.0527,
"step": 1046
},
{
"epoch": 1.7596638655462185,
"grad_norm": 3.7177435770144847,
"learning_rate": 3.5209006507404883e-07,
"loss": 0.0668,
"step": 1047
},
{
"epoch": 1.761344537815126,
"grad_norm": 2.607571098443921,
"learning_rate": 3.4724055548272406e-07,
"loss": 0.0498,
"step": 1048
},
{
"epoch": 1.7630252100840336,
"grad_norm": 4.459090476779105,
"learning_rate": 3.4242347359312864e-07,
"loss": 0.076,
"step": 1049
},
{
"epoch": 1.7647058823529411,
"grad_norm": 5.168276264230189,
"learning_rate": 3.3763885297822153e-07,
"loss": 0.0654,
"step": 1050
},
{
"epoch": 1.7663865546218487,
"grad_norm": 2.9922484939596576,
"learning_rate": 3.3288672698471804e-07,
"loss": 0.0583,
"step": 1051
},
{
"epoch": 1.7680672268907562,
"grad_norm": 3.083684278623569,
"learning_rate": 3.281671287328614e-07,
"loss": 0.0646,
"step": 1052
},
{
"epoch": 1.7697478991596638,
"grad_norm": 3.620493118528574,
"learning_rate": 3.2348009111619227e-07,
"loss": 0.0948,
"step": 1053
},
{
"epoch": 1.7714285714285714,
"grad_norm": 2.7564111302020544,
"learning_rate": 3.18825646801314e-07,
"loss": 0.0462,
"step": 1054
},
{
"epoch": 1.773109243697479,
"grad_norm": 2.8782377534767076,
"learning_rate": 3.1420382822767326e-07,
"loss": 0.0502,
"step": 1055
},
{
"epoch": 1.7747899159663865,
"grad_norm": 3.4400037527416836,
"learning_rate": 3.096146676073253e-07,
"loss": 0.0792,
"step": 1056
},
{
"epoch": 1.776470588235294,
"grad_norm": 2.6901880238607814,
"learning_rate": 3.0505819692471797e-07,
"loss": 0.0593,
"step": 1057
},
{
"epoch": 1.7781512605042016,
"grad_norm": 3.6125322665717046,
"learning_rate": 3.0053444793646024e-07,
"loss": 0.066,
"step": 1058
},
{
"epoch": 1.7798319327731091,
"grad_norm": 3.3356619840985164,
"learning_rate": 2.960434521711086e-07,
"loss": 0.0495,
"step": 1059
},
{
"epoch": 1.7815126050420167,
"grad_norm": 2.449842215488209,
"learning_rate": 2.915852409289421e-07,
"loss": 0.0584,
"step": 1060
},
{
"epoch": 1.7831932773109243,
"grad_norm": 3.8717672742770795,
"learning_rate": 2.8715984528174757e-07,
"loss": 0.0611,
"step": 1061
},
{
"epoch": 1.7848739495798318,
"grad_norm": 3.2030831698927975,
"learning_rate": 2.827672960725991e-07,
"loss": 0.0913,
"step": 1062
},
{
"epoch": 1.7865546218487394,
"grad_norm": 3.865848748318045,
"learning_rate": 2.7840762391564634e-07,
"loss": 0.0762,
"step": 1063
},
{
"epoch": 1.788235294117647,
"grad_norm": 3.4085943532702103,
"learning_rate": 2.7408085919590265e-07,
"loss": 0.0552,
"step": 1064
},
{
"epoch": 1.7899159663865545,
"grad_norm": 3.1995440473790335,
"learning_rate": 2.697870320690266e-07,
"loss": 0.0502,
"step": 1065
},
{
"epoch": 1.791596638655462,
"grad_norm": 3.7669348300021124,
"learning_rate": 2.6552617246111966e-07,
"loss": 0.0574,
"step": 1066
},
{
"epoch": 1.7932773109243696,
"grad_norm": 3.3293423324211098,
"learning_rate": 2.612983100685118e-07,
"loss": 0.0518,
"step": 1067
},
{
"epoch": 1.7949579831932772,
"grad_norm": 6.432997589961303,
"learning_rate": 2.5710347435755955e-07,
"loss": 0.0604,
"step": 1068
},
{
"epoch": 1.7966386554621847,
"grad_norm": 3.3108833882683464,
"learning_rate": 2.5294169456443416e-07,
"loss": 0.056,
"step": 1069
},
{
"epoch": 1.7983193277310925,
"grad_norm": 2.9734777133032835,
"learning_rate": 2.4881299969492514e-07,
"loss": 0.0646,
"step": 1070
},
{
"epoch": 1.8,
"grad_norm": 2.860144532702313,
"learning_rate": 2.447174185242324e-07,
"loss": 0.0727,
"step": 1071
},
{
"epoch": 1.8016806722689076,
"grad_norm": 4.282160091271182,
"learning_rate": 2.406549795967678e-07,
"loss": 0.067,
"step": 1072
},
{
"epoch": 1.8033613445378152,
"grad_norm": 3.3754058094707937,
"learning_rate": 2.366257112259579e-07,
"loss": 0.0581,
"step": 1073
},
{
"epoch": 1.8050420168067227,
"grad_norm": 2.306814130707301,
"learning_rate": 2.3262964149404322e-07,
"loss": 0.0531,
"step": 1074
},
{
"epoch": 1.8067226890756303,
"grad_norm": 2.9409070748310935,
"learning_rate": 2.286667982518853e-07,
"loss": 0.0639,
"step": 1075
},
{
"epoch": 1.8084033613445378,
"grad_norm": 3.0396807188768644,
"learning_rate": 2.247372091187705e-07,
"loss": 0.0603,
"step": 1076
},
{
"epoch": 1.8100840336134454,
"grad_norm": 4.04313014248122,
"learning_rate": 2.2084090148221937e-07,
"loss": 0.0986,
"step": 1077
},
{
"epoch": 1.811764705882353,
"grad_norm": 3.0115016227322697,
"learning_rate": 2.1697790249779638e-07,
"loss": 0.0681,
"step": 1078
},
{
"epoch": 1.8134453781512605,
"grad_norm": 3.2168149498398533,
"learning_rate": 2.1314823908891558e-07,
"loss": 0.0611,
"step": 1079
},
{
"epoch": 1.815126050420168,
"grad_norm": 3.830162600946003,
"learning_rate": 2.093519379466602e-07,
"loss": 0.0881,
"step": 1080
},
{
"epoch": 1.8168067226890756,
"grad_norm": 2.4163130625409672,
"learning_rate": 2.0558902552959058e-07,
"loss": 0.0647,
"step": 1081
},
{
"epoch": 1.8184873949579832,
"grad_norm": 3.139900286939481,
"learning_rate": 2.018595280635638e-07,
"loss": 0.0537,
"step": 1082
},
{
"epoch": 1.8201680672268907,
"grad_norm": 3.1466684302589423,
"learning_rate": 1.981634715415487e-07,
"loss": 0.0523,
"step": 1083
},
{
"epoch": 1.8218487394957983,
"grad_norm": 3.9632174225214984,
"learning_rate": 1.945008817234445e-07,
"loss": 0.0916,
"step": 1084
},
{
"epoch": 1.8235294117647058,
"grad_norm": 3.36248537702875,
"learning_rate": 1.908717841359048e-07,
"loss": 0.0558,
"step": 1085
},
{
"epoch": 1.8252100840336134,
"grad_norm": 3.39348312293213,
"learning_rate": 1.8727620407215375e-07,
"loss": 0.0439,
"step": 1086
},
{
"epoch": 1.826890756302521,
"grad_norm": 4.032088455291679,
"learning_rate": 1.837141665918152e-07,
"loss": 0.0771,
"step": 1087
},
{
"epoch": 1.8285714285714287,
"grad_norm": 3.4190633994412836,
"learning_rate": 1.801856965207338e-07,
"loss": 0.0563,
"step": 1088
},
{
"epoch": 1.8302521008403363,
"grad_norm": 3.534886276176037,
"learning_rate": 1.7669081845080648e-07,
"loss": 0.0692,
"step": 1089
},
{
"epoch": 1.8319327731092439,
"grad_norm": 3.4435868355700174,
"learning_rate": 1.7322955673980678e-07,
"loss": 0.0625,
"step": 1090
},
{
"epoch": 1.8336134453781514,
"grad_norm": 3.241199652941816,
"learning_rate": 1.6980193551121848e-07,
"loss": 0.0741,
"step": 1091
},
{
"epoch": 1.835294117647059,
"grad_norm": 2.800413218064161,
"learning_rate": 1.664079786540629e-07,
"loss": 0.0684,
"step": 1092
},
{
"epoch": 1.8369747899159665,
"grad_norm": 3.8889142045528775,
"learning_rate": 1.6304770982273898e-07,
"loss": 0.0871,
"step": 1093
},
{
"epoch": 1.838655462184874,
"grad_norm": 2.996815076712509,
"learning_rate": 1.597211524368536e-07,
"loss": 0.0587,
"step": 1094
},
{
"epoch": 1.8403361344537816,
"grad_norm": 3.0926722750899023,
"learning_rate": 1.564283296810576e-07,
"loss": 0.0551,
"step": 1095
},
{
"epoch": 1.8420168067226892,
"grad_norm": 3.5055235248239627,
"learning_rate": 1.5316926450488878e-07,
"loss": 0.0633,
"step": 1096
},
{
"epoch": 1.8436974789915967,
"grad_norm": 2.663472736681946,
"learning_rate": 1.499439796226082e-07,
"loss": 0.0619,
"step": 1097
},
{
"epoch": 1.8453781512605043,
"grad_norm": 3.953570927026477,
"learning_rate": 1.4675249751304353e-07,
"loss": 0.078,
"step": 1098
},
{
"epoch": 1.8470588235294119,
"grad_norm": 2.493322146525807,
"learning_rate": 1.435948404194304e-07,
"loss": 0.0368,
"step": 1099
},
{
"epoch": 1.8487394957983194,
"grad_norm": 2.3023658930459403,
"learning_rate": 1.404710303492618e-07,
"loss": 0.0426,
"step": 1100
},
{
"epoch": 1.850420168067227,
"grad_norm": 2.97412848554832,
"learning_rate": 1.373810890741284e-07,
"loss": 0.0581,
"step": 1101
},
{
"epoch": 1.8521008403361345,
"grad_norm": 3.74074771238985,
"learning_rate": 1.3432503812957242e-07,
"loss": 0.07,
"step": 1102
},
{
"epoch": 1.853781512605042,
"grad_norm": 3.4159524020499354,
"learning_rate": 1.3130289881493452e-07,
"loss": 0.1028,
"step": 1103
},
{
"epoch": 1.8554621848739496,
"grad_norm": 3.4247555155007534,
"learning_rate": 1.2831469219320603e-07,
"loss": 0.075,
"step": 1104
},
{
"epoch": 1.8571428571428572,
"grad_norm": 3.3014047371850235,
"learning_rate": 1.253604390908819e-07,
"loss": 0.0744,
"step": 1105
},
{
"epoch": 1.8588235294117648,
"grad_norm": 2.920013877825399,
"learning_rate": 1.22440160097817e-07,
"loss": 0.0579,
"step": 1106
},
{
"epoch": 1.8605042016806723,
"grad_norm": 3.7369832957086153,
"learning_rate": 1.1955387556708e-07,
"loss": 0.0542,
"step": 1107
},
{
"epoch": 1.8621848739495799,
"grad_norm": 2.2666088356536345,
"learning_rate": 1.1670160561481458e-07,
"loss": 0.0371,
"step": 1108
},
{
"epoch": 1.8638655462184874,
"grad_norm": 2.674691850295334,
"learning_rate": 1.1388337012009643e-07,
"loss": 0.0507,
"step": 1109
},
{
"epoch": 1.865546218487395,
"grad_norm": 4.010100748714858,
"learning_rate": 1.1109918872479642e-07,
"loss": 0.1174,
"step": 1110
},
{
"epoch": 1.8672268907563025,
"grad_norm": 2.892673598502158,
"learning_rate": 1.0834908083344253e-07,
"loss": 0.0548,
"step": 1111
},
{
"epoch": 1.86890756302521,
"grad_norm": 3.1318654544587696,
"learning_rate": 1.0563306561308773e-07,
"loss": 0.0626,
"step": 1112
},
{
"epoch": 1.8705882352941177,
"grad_norm": 2.6308962683101083,
"learning_rate": 1.0295116199317057e-07,
"loss": 0.049,
"step": 1113
},
{
"epoch": 1.8722689075630252,
"grad_norm": 3.2429908938438805,
"learning_rate": 1.0030338866538925e-07,
"loss": 0.0965,
"step": 1114
},
{
"epoch": 1.8739495798319328,
"grad_norm": 2.8503852447249067,
"learning_rate": 9.768976408356667e-08,
"loss": 0.07,
"step": 1115
},
{
"epoch": 1.8756302521008403,
"grad_norm": 2.677806287085677,
"learning_rate": 9.511030646352615e-08,
"loss": 0.053,
"step": 1116
},
{
"epoch": 1.877310924369748,
"grad_norm": 3.1946750075620844,
"learning_rate": 9.256503378295978e-08,
"loss": 0.0651,
"step": 1117
},
{
"epoch": 1.8789915966386554,
"grad_norm": 3.584179246441202,
"learning_rate": 9.005396378130748e-08,
"loss": 0.0791,
"step": 1118
},
{
"epoch": 1.880672268907563,
"grad_norm": 3.61746694704659,
"learning_rate": 8.757711395963097e-08,
"loss": 0.0793,
"step": 1119
},
{
"epoch": 1.8823529411764706,
"grad_norm": 4.645088603678588,
"learning_rate": 8.513450158049109e-08,
"loss": 0.1481,
"step": 1120
},
{
"epoch": 1.8840336134453781,
"grad_norm": 4.909166462690628,
"learning_rate": 8.27261436678306e-08,
"loss": 0.0949,
"step": 1121
},
{
"epoch": 1.8857142857142857,
"grad_norm": 3.129071533007231,
"learning_rate": 8.035205700685167e-08,
"loss": 0.0566,
"step": 1122
},
{
"epoch": 1.8873949579831932,
"grad_norm": 3.657589121538267,
"learning_rate": 7.801225814390245e-08,
"loss": 0.067,
"step": 1123
},
{
"epoch": 1.8890756302521008,
"grad_norm": 3.694014323127052,
"learning_rate": 7.570676338635896e-08,
"loss": 0.0838,
"step": 1124
},
{
"epoch": 1.8907563025210083,
"grad_norm": 3.6696526234853604,
"learning_rate": 7.343558880251289e-08,
"loss": 0.0768,
"step": 1125
},
{
"epoch": 1.892436974789916,
"grad_norm": 3.3985635955632123,
"learning_rate": 7.11987502214595e-08,
"loss": 0.0538,
"step": 1126
},
{
"epoch": 1.8941176470588235,
"grad_norm": 2.9132010200327234,
"learning_rate": 6.899626323298714e-08,
"loss": 0.052,
"step": 1127
},
{
"epoch": 1.895798319327731,
"grad_norm": 3.029727226881479,
"learning_rate": 6.682814318746844e-08,
"loss": 0.0461,
"step": 1128
},
{
"epoch": 1.8974789915966386,
"grad_norm": 3.0897927655523967,
"learning_rate": 6.46944051957532e-08,
"loss": 0.059,
"step": 1129
},
{
"epoch": 1.8991596638655461,
"grad_norm": 2.497103912549752,
"learning_rate": 6.259506412906402e-08,
"loss": 0.0569,
"step": 1130
},
{
"epoch": 1.9008403361344537,
"grad_norm": 4.753093028355241,
"learning_rate": 6.053013461889023e-08,
"loss": 0.066,
"step": 1131
},
{
"epoch": 1.9025210084033612,
"grad_norm": 2.744729456009063,
"learning_rate": 5.849963105689027e-08,
"loss": 0.0673,
"step": 1132
},
{
"epoch": 1.9042016806722688,
"grad_norm": 3.6223706521954235,
"learning_rate": 5.65035675947867e-08,
"loss": 0.0719,
"step": 1133
},
{
"epoch": 1.9058823529411764,
"grad_norm": 2.6260713125590813,
"learning_rate": 5.454195814427021e-08,
"loss": 0.0482,
"step": 1134
},
{
"epoch": 1.907563025210084,
"grad_norm": 3.977427947617622,
"learning_rate": 5.261481637690247e-08,
"loss": 0.0488,
"step": 1135
},
{
"epoch": 1.9092436974789915,
"grad_norm": 2.0514624582452434,
"learning_rate": 5.072215572402006e-08,
"loss": 0.038,
"step": 1136
},
{
"epoch": 1.910924369747899,
"grad_norm": 3.2121969818313083,
"learning_rate": 4.886398937664127e-08,
"loss": 0.0658,
"step": 1137
},
{
"epoch": 1.9126050420168066,
"grad_norm": 3.2196806088420393,
"learning_rate": 4.704033028537391e-08,
"loss": 0.0496,
"step": 1138
},
{
"epoch": 1.9142857142857141,
"grad_norm": 3.2967972931881713,
"learning_rate": 4.52511911603265e-08,
"loss": 0.0734,
"step": 1139
},
{
"epoch": 1.9159663865546217,
"grad_norm": 2.542731148202116,
"learning_rate": 4.3496584471016125e-08,
"loss": 0.0403,
"step": 1140
},
{
"epoch": 1.9176470588235293,
"grad_norm": 3.152363972853614,
"learning_rate": 4.177652244628627e-08,
"loss": 0.0438,
"step": 1141
},
{
"epoch": 1.9193277310924368,
"grad_norm": 3.166435459793863,
"learning_rate": 4.009101707421803e-08,
"loss": 0.0765,
"step": 1142
},
{
"epoch": 1.9210084033613444,
"grad_norm": 3.04932577570153,
"learning_rate": 3.8440080102047364e-08,
"loss": 0.0641,
"step": 1143
},
{
"epoch": 1.9226890756302522,
"grad_norm": 3.031257181815333,
"learning_rate": 3.6823723036084616e-08,
"loss": 0.0508,
"step": 1144
},
{
"epoch": 1.9243697478991597,
"grad_norm": 3.1891638467360695,
"learning_rate": 3.5241957141632923e-08,
"loss": 0.0579,
"step": 1145
},
{
"epoch": 1.9260504201680673,
"grad_norm": 4.517312973719181,
"learning_rate": 3.369479344290938e-08,
"loss": 0.0876,
"step": 1146
},
{
"epoch": 1.9277310924369748,
"grad_norm": 3.700818940451555,
"learning_rate": 3.218224272296955e-08,
"loss": 0.0518,
"step": 1147
},
{
"epoch": 1.9294117647058824,
"grad_norm": 2.346728135463526,
"learning_rate": 3.0704315523631956e-08,
"loss": 0.0487,
"step": 1148
},
{
"epoch": 1.93109243697479,
"grad_norm": 3.167266611349493,
"learning_rate": 2.926102214540316e-08,
"loss": 0.0566,
"step": 1149
},
{
"epoch": 1.9327731092436975,
"grad_norm": 2.860221027996709,
"learning_rate": 2.7852372647407812e-08,
"loss": 0.0593,
"step": 1150
},
{
"epoch": 1.934453781512605,
"grad_norm": 4.135973604969323,
"learning_rate": 2.6478376847318687e-08,
"loss": 0.074,
"step": 1151
},
{
"epoch": 1.9361344537815126,
"grad_norm": 3.341556979644147,
"learning_rate": 2.5139044321286223e-08,
"loss": 0.0699,
"step": 1152
},
{
"epoch": 1.9378151260504202,
"grad_norm": 3.027802548930941,
"learning_rate": 2.383438440387298e-08,
"loss": 0.0709,
"step": 1153
},
{
"epoch": 1.9394957983193277,
"grad_norm": 3.2086989982561107,
"learning_rate": 2.256440618798872e-08,
"loss": 0.0612,
"step": 1154
},
{
"epoch": 1.9411764705882353,
"grad_norm": 3.5321511064341538,
"learning_rate": 2.1329118524827662e-08,
"loss": 0.0562,
"step": 1155
},
{
"epoch": 1.9428571428571428,
"grad_norm": 2.8118478612530735,
"learning_rate": 2.012853002380466e-08,
"loss": 0.0534,
"step": 1156
},
{
"epoch": 1.9445378151260504,
"grad_norm": 3.432208698670532,
"learning_rate": 1.896264905249856e-08,
"loss": 0.0837,
"step": 1157
},
{
"epoch": 1.946218487394958,
"grad_norm": 3.21605104376969,
"learning_rate": 1.783148373659005e-08,
"loss": 0.0506,
"step": 1158
},
{
"epoch": 1.9478991596638655,
"grad_norm": 3.896377413465593,
"learning_rate": 1.6735041959806686e-08,
"loss": 0.075,
"step": 1159
},
{
"epoch": 1.949579831932773,
"grad_norm": 3.519301914030783,
"learning_rate": 1.567333136387017e-08,
"loss": 0.0726,
"step": 1160
},
{
"epoch": 1.9512605042016806,
"grad_norm": 3.6292612141416334,
"learning_rate": 1.4646359348439165e-08,
"loss": 0.0706,
"step": 1161
},
{
"epoch": 1.9529411764705882,
"grad_norm": 3.132655139876115,
"learning_rate": 1.3654133071059894e-08,
"loss": 0.0564,
"step": 1162
},
{
"epoch": 1.954621848739496,
"grad_norm": 2.404923639354769,
"learning_rate": 1.2696659447116732e-08,
"loss": 0.0453,
"step": 1163
},
{
"epoch": 1.9563025210084035,
"grad_norm": 2.759015592903886,
"learning_rate": 1.1773945149782805e-08,
"loss": 0.0592,
"step": 1164
},
{
"epoch": 1.957983193277311,
"grad_norm": 2.932560579044183,
"learning_rate": 1.088599660997336e-08,
"loss": 0.0489,
"step": 1165
},
{
"epoch": 1.9596638655462186,
"grad_norm": 3.0306047589144036,
"learning_rate": 1.0032820016302458e-08,
"loss": 0.0458,
"step": 1166
},
{
"epoch": 1.9613445378151262,
"grad_norm": 4.176058096232488,
"learning_rate": 9.21442131503858e-09,
"loss": 0.0638,
"step": 1167
},
{
"epoch": 1.9630252100840337,
"grad_norm": 3.26158879531482,
"learning_rate": 8.430806210062426e-09,
"loss": 0.0583,
"step": 1168
},
{
"epoch": 1.9647058823529413,
"grad_norm": 2.606267093967038,
"learning_rate": 7.681980162830283e-09,
"loss": 0.0536,
"step": 1169
},
{
"epoch": 1.9663865546218489,
"grad_norm": 3.1278293687636625,
"learning_rate": 6.9679483923318356e-09,
"loss": 0.0747,
"step": 1170
},
{
"epoch": 1.9680672268907564,
"grad_norm": 3.290980715214881,
"learning_rate": 6.288715875057416e-09,
"loss": 0.0778,
"step": 1171
},
{
"epoch": 1.969747899159664,
"grad_norm": 3.044737695675404,
"learning_rate": 5.644287344960253e-09,
"loss": 0.0566,
"step": 1172
},
{
"epoch": 1.9714285714285715,
"grad_norm": 2.946504192696155,
"learning_rate": 5.034667293427053e-09,
"loss": 0.0629,
"step": 1173
},
{
"epoch": 1.973109243697479,
"grad_norm": 3.186912828675924,
"learning_rate": 4.45985996924192e-09,
"loss": 0.0803,
"step": 1174
},
{
"epoch": 1.9747899159663866,
"grad_norm": 3.892275647593651,
"learning_rate": 3.919869378561925e-09,
"loss": 0.0805,
"step": 1175
},
{
"epoch": 1.9764705882352942,
"grad_norm": 3.389822194680044,
"learning_rate": 3.41469928488547e-09,
"loss": 0.0566,
"step": 1176
},
{
"epoch": 1.9781512605042018,
"grad_norm": 2.5650320359231693,
"learning_rate": 2.9443532090273064e-09,
"loss": 0.0613,
"step": 1177
},
{
"epoch": 1.9798319327731093,
"grad_norm": 3.010639961534959,
"learning_rate": 2.508834429094664e-09,
"loss": 0.0699,
"step": 1178
},
{
"epoch": 1.9815126050420169,
"grad_norm": 2.5014084566068804,
"learning_rate": 2.108145980462828e-09,
"loss": 0.0504,
"step": 1179
},
{
"epoch": 1.9831932773109244,
"grad_norm": 3.9182990233667945,
"learning_rate": 1.7422906557557074e-09,
"loss": 0.0827,
"step": 1180
},
{
"epoch": 1.984873949579832,
"grad_norm": 3.507465961115335,
"learning_rate": 1.4112710048247436e-09,
"loss": 0.0854,
"step": 1181
},
{
"epoch": 1.9865546218487395,
"grad_norm": 3.0650868400573907,
"learning_rate": 1.1150893347328107e-09,
"loss": 0.0657,
"step": 1182
},
{
"epoch": 1.988235294117647,
"grad_norm": 2.9055910306096964,
"learning_rate": 8.537477097364522e-10,
"loss": 0.0624,
"step": 1183
},
{
"epoch": 1.9899159663865547,
"grad_norm": 4.616023305998801,
"learning_rate": 6.272479512731133e-10,
"loss": 0.056,
"step": 1184
},
{
"epoch": 1.9915966386554622,
"grad_norm": 2.6716933754582874,
"learning_rate": 4.3559163794670844e-10,
"loss": 0.0567,
"step": 1185
},
{
"epoch": 1.9932773109243698,
"grad_norm": 3.4964821514974602,
"learning_rate": 2.787801055181838e-10,
"loss": 0.0722,
"step": 1186
},
{
"epoch": 1.9949579831932773,
"grad_norm": 2.654518550392596,
"learning_rate": 1.568144468955257e-10,
"loss": 0.0604,
"step": 1187
},
{
"epoch": 1.9966386554621849,
"grad_norm": 3.8620551224683424,
"learning_rate": 6.969551212598901e-11,
"loss": 0.098,
"step": 1188
},
{
"epoch": 1.9983193277310924,
"grad_norm": 3.276316686896461,
"learning_rate": 1.7423908390545862e-11,
"loss": 0.0731,
"step": 1189
},
{
"epoch": 2.0,
"grad_norm": 2.6008822542366743,
"learning_rate": 0.0,
"loss": 0.0496,
"step": 1190
},
{
"epoch": 2.0,
"step": 1190,
"total_flos": 1781687255040.0,
"train_loss": 0.14153178428464075,
"train_runtime": 543.0923,
"train_samples_per_second": 17.515,
"train_steps_per_second": 2.191
}
],
"logging_steps": 1,
"max_steps": 1190,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1781687255040.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}