random_YUPss8MMFv8obvkZ / trainer_state.json
cutelemonlili's picture
Add files using upload-large-folder tool
f7a2e17 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1358,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014727540500736377,
"grad_norm": 4.94458948599831,
"learning_rate": 9.999986620495792e-06,
"loss": 0.3378,
"step": 1
},
{
"epoch": 0.0029455081001472753,
"grad_norm": 4.166056018399312,
"learning_rate": 9.999946482054771e-06,
"loss": 0.3049,
"step": 2
},
{
"epoch": 0.004418262150220913,
"grad_norm": 3.840790925563368,
"learning_rate": 9.999879584891754e-06,
"loss": 0.3079,
"step": 3
},
{
"epoch": 0.005891016200294551,
"grad_norm": 3.4823512343952325,
"learning_rate": 9.999785929364756e-06,
"loss": 0.278,
"step": 4
},
{
"epoch": 0.007363770250368188,
"grad_norm": 3.6374829783923586,
"learning_rate": 9.999665515975005e-06,
"loss": 0.2717,
"step": 5
},
{
"epoch": 0.008836524300441826,
"grad_norm": 3.8692984807597726,
"learning_rate": 9.999518345366933e-06,
"loss": 0.3032,
"step": 6
},
{
"epoch": 0.010309278350515464,
"grad_norm": 3.246353468882525,
"learning_rate": 9.999344418328161e-06,
"loss": 0.2181,
"step": 7
},
{
"epoch": 0.011782032400589101,
"grad_norm": 3.3637880240853746,
"learning_rate": 9.999143735789518e-06,
"loss": 0.2504,
"step": 8
},
{
"epoch": 0.013254786450662739,
"grad_norm": 2.84021111636833,
"learning_rate": 9.998916298825015e-06,
"loss": 0.2165,
"step": 9
},
{
"epoch": 0.014727540500736377,
"grad_norm": 2.9675485045739736,
"learning_rate": 9.99866210865185e-06,
"loss": 0.2166,
"step": 10
},
{
"epoch": 0.016200294550810016,
"grad_norm": 3.673938624977175,
"learning_rate": 9.998381166630395e-06,
"loss": 0.257,
"step": 11
},
{
"epoch": 0.017673048600883652,
"grad_norm": 3.386111654135163,
"learning_rate": 9.9980734742642e-06,
"loss": 0.2725,
"step": 12
},
{
"epoch": 0.01914580265095729,
"grad_norm": 3.1832497014707113,
"learning_rate": 9.997739033199974e-06,
"loss": 0.2325,
"step": 13
},
{
"epoch": 0.020618556701030927,
"grad_norm": 2.437336524322532,
"learning_rate": 9.997377845227577e-06,
"loss": 0.183,
"step": 14
},
{
"epoch": 0.022091310751104567,
"grad_norm": 2.7443979256988675,
"learning_rate": 9.996989912280015e-06,
"loss": 0.2183,
"step": 15
},
{
"epoch": 0.023564064801178203,
"grad_norm": 2.759798152580142,
"learning_rate": 9.996575236433428e-06,
"loss": 0.1976,
"step": 16
},
{
"epoch": 0.025036818851251842,
"grad_norm": 3.712743145586044,
"learning_rate": 9.996133819907082e-06,
"loss": 0.3022,
"step": 17
},
{
"epoch": 0.026509572901325478,
"grad_norm": 3.515452198275458,
"learning_rate": 9.995665665063349e-06,
"loss": 0.2521,
"step": 18
},
{
"epoch": 0.027982326951399118,
"grad_norm": 3.2896891836927065,
"learning_rate": 9.9951707744077e-06,
"loss": 0.2439,
"step": 19
},
{
"epoch": 0.029455081001472753,
"grad_norm": 3.1762823618397507,
"learning_rate": 9.994649150588694e-06,
"loss": 0.2565,
"step": 20
},
{
"epoch": 0.030927835051546393,
"grad_norm": 2.8582894091949735,
"learning_rate": 9.994100796397954e-06,
"loss": 0.201,
"step": 21
},
{
"epoch": 0.03240058910162003,
"grad_norm": 3.180040249667404,
"learning_rate": 9.993525714770167e-06,
"loss": 0.2983,
"step": 22
},
{
"epoch": 0.033873343151693665,
"grad_norm": 3.798894109532273,
"learning_rate": 9.992923908783054e-06,
"loss": 0.2578,
"step": 23
},
{
"epoch": 0.035346097201767304,
"grad_norm": 3.3577639769979917,
"learning_rate": 9.992295381657361e-06,
"loss": 0.2551,
"step": 24
},
{
"epoch": 0.036818851251840944,
"grad_norm": 3.027011475769264,
"learning_rate": 9.991640136756843e-06,
"loss": 0.2394,
"step": 25
},
{
"epoch": 0.03829160530191458,
"grad_norm": 3.803089146754714,
"learning_rate": 9.990958177588236e-06,
"loss": 0.2749,
"step": 26
},
{
"epoch": 0.039764359351988215,
"grad_norm": 2.714001785750428,
"learning_rate": 9.990249507801257e-06,
"loss": 0.2391,
"step": 27
},
{
"epoch": 0.041237113402061855,
"grad_norm": 3.6859042183737447,
"learning_rate": 9.98951413118856e-06,
"loss": 0.2897,
"step": 28
},
{
"epoch": 0.042709867452135494,
"grad_norm": 3.45627286944561,
"learning_rate": 9.988752051685736e-06,
"loss": 0.2453,
"step": 29
},
{
"epoch": 0.044182621502209134,
"grad_norm": 3.5275255658767093,
"learning_rate": 9.987963273371287e-06,
"loss": 0.2442,
"step": 30
},
{
"epoch": 0.045655375552282766,
"grad_norm": 2.8285444922182967,
"learning_rate": 9.987147800466593e-06,
"loss": 0.2494,
"step": 31
},
{
"epoch": 0.047128129602356406,
"grad_norm": 2.945285023713762,
"learning_rate": 9.986305637335907e-06,
"loss": 0.2292,
"step": 32
},
{
"epoch": 0.048600883652430045,
"grad_norm": 2.5322659450463547,
"learning_rate": 9.985436788486317e-06,
"loss": 0.1992,
"step": 33
},
{
"epoch": 0.050073637702503684,
"grad_norm": 3.264319463862271,
"learning_rate": 9.984541258567732e-06,
"loss": 0.2289,
"step": 34
},
{
"epoch": 0.05154639175257732,
"grad_norm": 3.231647823081557,
"learning_rate": 9.983619052372847e-06,
"loss": 0.2446,
"step": 35
},
{
"epoch": 0.053019145802650956,
"grad_norm": 2.5793360921976465,
"learning_rate": 9.982670174837131e-06,
"loss": 0.2099,
"step": 36
},
{
"epoch": 0.054491899852724596,
"grad_norm": 4.189833287500822,
"learning_rate": 9.981694631038785e-06,
"loss": 0.2302,
"step": 37
},
{
"epoch": 0.055964653902798235,
"grad_norm": 2.822296706691291,
"learning_rate": 9.980692426198728e-06,
"loss": 0.1954,
"step": 38
},
{
"epoch": 0.05743740795287187,
"grad_norm": 3.517010072729644,
"learning_rate": 9.97966356568056e-06,
"loss": 0.282,
"step": 39
},
{
"epoch": 0.05891016200294551,
"grad_norm": 2.9747996462515527,
"learning_rate": 9.97860805499054e-06,
"loss": 0.2311,
"step": 40
},
{
"epoch": 0.060382916053019146,
"grad_norm": 3.571132661443482,
"learning_rate": 9.977525899777549e-06,
"loss": 0.2048,
"step": 41
},
{
"epoch": 0.061855670103092786,
"grad_norm": 2.8914392164923757,
"learning_rate": 9.97641710583307e-06,
"loss": 0.228,
"step": 42
},
{
"epoch": 0.06332842415316642,
"grad_norm": 3.342268753640644,
"learning_rate": 9.97528167909115e-06,
"loss": 0.2652,
"step": 43
},
{
"epoch": 0.06480117820324006,
"grad_norm": 2.733501107730972,
"learning_rate": 9.97411962562836e-06,
"loss": 0.1933,
"step": 44
},
{
"epoch": 0.0662739322533137,
"grad_norm": 3.077042960095706,
"learning_rate": 9.97293095166379e-06,
"loss": 0.1986,
"step": 45
},
{
"epoch": 0.06774668630338733,
"grad_norm": 2.7641800941458365,
"learning_rate": 9.971715663558978e-06,
"loss": 0.2019,
"step": 46
},
{
"epoch": 0.06921944035346098,
"grad_norm": 3.2668699260976894,
"learning_rate": 9.970473767817913e-06,
"loss": 0.2039,
"step": 47
},
{
"epoch": 0.07069219440353461,
"grad_norm": 3.3534010470167703,
"learning_rate": 9.969205271086969e-06,
"loss": 0.2109,
"step": 48
},
{
"epoch": 0.07216494845360824,
"grad_norm": 3.2183136865151187,
"learning_rate": 9.96791018015489e-06,
"loss": 0.3102,
"step": 49
},
{
"epoch": 0.07363770250368189,
"grad_norm": 2.9261416367269426,
"learning_rate": 9.966588501952747e-06,
"loss": 0.1933,
"step": 50
},
{
"epoch": 0.07511045655375552,
"grad_norm": 3.1755105009061326,
"learning_rate": 9.965240243553899e-06,
"loss": 0.2289,
"step": 51
},
{
"epoch": 0.07658321060382917,
"grad_norm": 3.190180430758934,
"learning_rate": 9.963865412173958e-06,
"loss": 0.2387,
"step": 52
},
{
"epoch": 0.0780559646539028,
"grad_norm": 2.752399278611623,
"learning_rate": 9.962464015170748e-06,
"loss": 0.2387,
"step": 53
},
{
"epoch": 0.07952871870397643,
"grad_norm": 3.5795903714534654,
"learning_rate": 9.961036060044268e-06,
"loss": 0.1661,
"step": 54
},
{
"epoch": 0.08100147275405008,
"grad_norm": 2.9565589967050423,
"learning_rate": 9.959581554436654e-06,
"loss": 0.2617,
"step": 55
},
{
"epoch": 0.08247422680412371,
"grad_norm": 3.0373689834666755,
"learning_rate": 9.958100506132127e-06,
"loss": 0.1963,
"step": 56
},
{
"epoch": 0.08394698085419734,
"grad_norm": 2.5982563190319525,
"learning_rate": 9.956592923056965e-06,
"loss": 0.1582,
"step": 57
},
{
"epoch": 0.08541973490427099,
"grad_norm": 3.3173710787447406,
"learning_rate": 9.955058813279454e-06,
"loss": 0.2197,
"step": 58
},
{
"epoch": 0.08689248895434462,
"grad_norm": 3.3097321090388334,
"learning_rate": 9.953498185009846e-06,
"loss": 0.2425,
"step": 59
},
{
"epoch": 0.08836524300441827,
"grad_norm": 2.9050353404780687,
"learning_rate": 9.951911046600313e-06,
"loss": 0.1833,
"step": 60
},
{
"epoch": 0.0898379970544919,
"grad_norm": 3.0423904876869603,
"learning_rate": 9.950297406544907e-06,
"loss": 0.1826,
"step": 61
},
{
"epoch": 0.09131075110456553,
"grad_norm": 3.17541167048525,
"learning_rate": 9.948657273479508e-06,
"loss": 0.211,
"step": 62
},
{
"epoch": 0.09278350515463918,
"grad_norm": 2.962504763876762,
"learning_rate": 9.946990656181782e-06,
"loss": 0.2213,
"step": 63
},
{
"epoch": 0.09425625920471281,
"grad_norm": 3.54508058393369,
"learning_rate": 9.945297563571135e-06,
"loss": 0.218,
"step": 64
},
{
"epoch": 0.09572901325478646,
"grad_norm": 2.59575497239498,
"learning_rate": 9.943578004708664e-06,
"loss": 0.201,
"step": 65
},
{
"epoch": 0.09720176730486009,
"grad_norm": 3.190873302956337,
"learning_rate": 9.941831988797104e-06,
"loss": 0.2394,
"step": 66
},
{
"epoch": 0.09867452135493372,
"grad_norm": 3.126965019920143,
"learning_rate": 9.940059525180788e-06,
"loss": 0.2143,
"step": 67
},
{
"epoch": 0.10014727540500737,
"grad_norm": 2.9885431966842484,
"learning_rate": 9.938260623345591e-06,
"loss": 0.2248,
"step": 68
},
{
"epoch": 0.101620029455081,
"grad_norm": 2.9461738168513083,
"learning_rate": 9.936435292918878e-06,
"loss": 0.1865,
"step": 69
},
{
"epoch": 0.10309278350515463,
"grad_norm": 2.649205090590615,
"learning_rate": 9.934583543669454e-06,
"loss": 0.1835,
"step": 70
},
{
"epoch": 0.10456553755522828,
"grad_norm": 4.042700652793221,
"learning_rate": 9.932705385507514e-06,
"loss": 0.2352,
"step": 71
},
{
"epoch": 0.10603829160530191,
"grad_norm": 3.387539024131993,
"learning_rate": 9.930800828484593e-06,
"loss": 0.2051,
"step": 72
},
{
"epoch": 0.10751104565537556,
"grad_norm": 3.7913024480425865,
"learning_rate": 9.928869882793495e-06,
"loss": 0.2465,
"step": 73
},
{
"epoch": 0.10898379970544919,
"grad_norm": 3.34902917263494,
"learning_rate": 9.926912558768261e-06,
"loss": 0.2331,
"step": 74
},
{
"epoch": 0.11045655375552282,
"grad_norm": 2.4043773466283302,
"learning_rate": 9.924928866884103e-06,
"loss": 0.1843,
"step": 75
},
{
"epoch": 0.11192930780559647,
"grad_norm": 2.5357530889863247,
"learning_rate": 9.922918817757346e-06,
"loss": 0.1752,
"step": 76
},
{
"epoch": 0.1134020618556701,
"grad_norm": 3.2637634737226056,
"learning_rate": 9.920882422145372e-06,
"loss": 0.2313,
"step": 77
},
{
"epoch": 0.11487481590574374,
"grad_norm": 3.731080966458299,
"learning_rate": 9.918819690946568e-06,
"loss": 0.2456,
"step": 78
},
{
"epoch": 0.11634756995581738,
"grad_norm": 3.047609400176811,
"learning_rate": 9.916730635200265e-06,
"loss": 0.2012,
"step": 79
},
{
"epoch": 0.11782032400589101,
"grad_norm": 2.9217369450670705,
"learning_rate": 9.914615266086668e-06,
"loss": 0.2083,
"step": 80
},
{
"epoch": 0.11929307805596466,
"grad_norm": 4.36023190619276,
"learning_rate": 9.912473594926821e-06,
"loss": 0.1813,
"step": 81
},
{
"epoch": 0.12076583210603829,
"grad_norm": 3.258283279415663,
"learning_rate": 9.910305633182518e-06,
"loss": 0.2091,
"step": 82
},
{
"epoch": 0.12223858615611193,
"grad_norm": 3.1772623956205663,
"learning_rate": 9.908111392456263e-06,
"loss": 0.2122,
"step": 83
},
{
"epoch": 0.12371134020618557,
"grad_norm": 2.956141700380078,
"learning_rate": 9.905890884491196e-06,
"loss": 0.2089,
"step": 84
},
{
"epoch": 0.1251840942562592,
"grad_norm": 3.496221823148631,
"learning_rate": 9.903644121171036e-06,
"loss": 0.2418,
"step": 85
},
{
"epoch": 0.12665684830633284,
"grad_norm": 3.75719996287404,
"learning_rate": 9.901371114520014e-06,
"loss": 0.2046,
"step": 86
},
{
"epoch": 0.12812960235640647,
"grad_norm": 3.191941100664362,
"learning_rate": 9.89907187670281e-06,
"loss": 0.217,
"step": 87
},
{
"epoch": 0.12960235640648013,
"grad_norm": 4.028688439687238,
"learning_rate": 9.89674642002449e-06,
"loss": 0.2247,
"step": 88
},
{
"epoch": 0.13107511045655376,
"grad_norm": 3.013392301649945,
"learning_rate": 9.894394756930437e-06,
"loss": 0.2055,
"step": 89
},
{
"epoch": 0.1325478645066274,
"grad_norm": 3.3003674751402143,
"learning_rate": 9.892016900006284e-06,
"loss": 0.2457,
"step": 90
},
{
"epoch": 0.13402061855670103,
"grad_norm": 2.688227321715556,
"learning_rate": 9.889612861977855e-06,
"loss": 0.1833,
"step": 91
},
{
"epoch": 0.13549337260677466,
"grad_norm": 3.5359724671571535,
"learning_rate": 9.887182655711078e-06,
"loss": 0.1991,
"step": 92
},
{
"epoch": 0.13696612665684832,
"grad_norm": 3.3331679118887747,
"learning_rate": 9.884726294211937e-06,
"loss": 0.1987,
"step": 93
},
{
"epoch": 0.13843888070692195,
"grad_norm": 2.3015340391465204,
"learning_rate": 9.882243790626393e-06,
"loss": 0.1772,
"step": 94
},
{
"epoch": 0.13991163475699558,
"grad_norm": 3.581411392511822,
"learning_rate": 9.879735158240314e-06,
"loss": 0.262,
"step": 95
},
{
"epoch": 0.14138438880706922,
"grad_norm": 3.3862669869305124,
"learning_rate": 9.877200410479399e-06,
"loss": 0.2068,
"step": 96
},
{
"epoch": 0.14285714285714285,
"grad_norm": 3.351128575072124,
"learning_rate": 9.874639560909118e-06,
"loss": 0.2399,
"step": 97
},
{
"epoch": 0.14432989690721648,
"grad_norm": 3.032326655318766,
"learning_rate": 9.872052623234632e-06,
"loss": 0.2081,
"step": 98
},
{
"epoch": 0.14580265095729014,
"grad_norm": 3.1356576751257768,
"learning_rate": 9.869439611300712e-06,
"loss": 0.2146,
"step": 99
},
{
"epoch": 0.14727540500736377,
"grad_norm": 2.9170663043693073,
"learning_rate": 9.866800539091688e-06,
"loss": 0.1936,
"step": 100
},
{
"epoch": 0.1487481590574374,
"grad_norm": 3.1762356343512477,
"learning_rate": 9.864135420731345e-06,
"loss": 0.2488,
"step": 101
},
{
"epoch": 0.15022091310751104,
"grad_norm": 3.331979025526921,
"learning_rate": 9.861444270482869e-06,
"loss": 0.2102,
"step": 102
},
{
"epoch": 0.15169366715758467,
"grad_norm": 2.5999400966061357,
"learning_rate": 9.858727102748762e-06,
"loss": 0.1745,
"step": 103
},
{
"epoch": 0.15316642120765833,
"grad_norm": 3.0703393900110902,
"learning_rate": 9.855983932070771e-06,
"loss": 0.1969,
"step": 104
},
{
"epoch": 0.15463917525773196,
"grad_norm": 3.3019426748278256,
"learning_rate": 9.853214773129796e-06,
"loss": 0.2471,
"step": 105
},
{
"epoch": 0.1561119293078056,
"grad_norm": 2.8316407741625222,
"learning_rate": 9.85041964074583e-06,
"loss": 0.1926,
"step": 106
},
{
"epoch": 0.15758468335787923,
"grad_norm": 2.3805838783625792,
"learning_rate": 9.847598549877867e-06,
"loss": 0.1631,
"step": 107
},
{
"epoch": 0.15905743740795286,
"grad_norm": 2.8943998454183517,
"learning_rate": 9.844751515623824e-06,
"loss": 0.2498,
"step": 108
},
{
"epoch": 0.16053019145802652,
"grad_norm": 3.379043461001327,
"learning_rate": 9.841878553220465e-06,
"loss": 0.2663,
"step": 109
},
{
"epoch": 0.16200294550810015,
"grad_norm": 2.71005551644853,
"learning_rate": 9.838979678043314e-06,
"loss": 0.1811,
"step": 110
},
{
"epoch": 0.1634756995581738,
"grad_norm": 2.8109475835050968,
"learning_rate": 9.836054905606578e-06,
"loss": 0.2296,
"step": 111
},
{
"epoch": 0.16494845360824742,
"grad_norm": 3.2874015483825256,
"learning_rate": 9.833104251563058e-06,
"loss": 0.2377,
"step": 112
},
{
"epoch": 0.16642120765832105,
"grad_norm": 2.9367556118169533,
"learning_rate": 9.830127731704067e-06,
"loss": 0.2109,
"step": 113
},
{
"epoch": 0.16789396170839468,
"grad_norm": 3.0606889648156863,
"learning_rate": 9.827125361959353e-06,
"loss": 0.2027,
"step": 114
},
{
"epoch": 0.16936671575846834,
"grad_norm": 3.397990076888804,
"learning_rate": 9.824097158397e-06,
"loss": 0.2227,
"step": 115
},
{
"epoch": 0.17083946980854198,
"grad_norm": 6.011726589982756,
"learning_rate": 9.821043137223356e-06,
"loss": 0.2235,
"step": 116
},
{
"epoch": 0.1723122238586156,
"grad_norm": 3.1315937708088524,
"learning_rate": 9.817963314782934e-06,
"loss": 0.1889,
"step": 117
},
{
"epoch": 0.17378497790868924,
"grad_norm": 2.972045976596521,
"learning_rate": 9.814857707558334e-06,
"loss": 0.1899,
"step": 118
},
{
"epoch": 0.17525773195876287,
"grad_norm": 3.024575764721706,
"learning_rate": 9.811726332170153e-06,
"loss": 0.1888,
"step": 119
},
{
"epoch": 0.17673048600883653,
"grad_norm": 2.862267267477572,
"learning_rate": 9.808569205376885e-06,
"loss": 0.1977,
"step": 120
},
{
"epoch": 0.17820324005891017,
"grad_norm": 3.1490402592480184,
"learning_rate": 9.80538634407485e-06,
"loss": 0.2022,
"step": 121
},
{
"epoch": 0.1796759941089838,
"grad_norm": 2.703112681626916,
"learning_rate": 9.802177765298091e-06,
"loss": 0.2172,
"step": 122
},
{
"epoch": 0.18114874815905743,
"grad_norm": 2.8427204160110517,
"learning_rate": 9.798943486218284e-06,
"loss": 0.1733,
"step": 123
},
{
"epoch": 0.18262150220913106,
"grad_norm": 3.2810082739191415,
"learning_rate": 9.795683524144649e-06,
"loss": 0.2001,
"step": 124
},
{
"epoch": 0.18409425625920472,
"grad_norm": 3.171988135686974,
"learning_rate": 9.792397896523857e-06,
"loss": 0.2304,
"step": 125
},
{
"epoch": 0.18556701030927836,
"grad_norm": 3.613990085160166,
"learning_rate": 9.789086620939936e-06,
"loss": 0.2841,
"step": 126
},
{
"epoch": 0.187039764359352,
"grad_norm": 3.436471960168319,
"learning_rate": 9.785749715114177e-06,
"loss": 0.2289,
"step": 127
},
{
"epoch": 0.18851251840942562,
"grad_norm": 2.66634702118146,
"learning_rate": 9.782387196905034e-06,
"loss": 0.1724,
"step": 128
},
{
"epoch": 0.18998527245949925,
"grad_norm": 3.1813963633826683,
"learning_rate": 9.778999084308043e-06,
"loss": 0.2295,
"step": 129
},
{
"epoch": 0.19145802650957292,
"grad_norm": 3.3659137081706367,
"learning_rate": 9.775585395455708e-06,
"loss": 0.2157,
"step": 130
},
{
"epoch": 0.19293078055964655,
"grad_norm": 3.4836322324211513,
"learning_rate": 9.772146148617414e-06,
"loss": 0.2214,
"step": 131
},
{
"epoch": 0.19440353460972018,
"grad_norm": 3.292266923964018,
"learning_rate": 9.76868136219933e-06,
"loss": 0.2089,
"step": 132
},
{
"epoch": 0.1958762886597938,
"grad_norm": 2.9068554190412046,
"learning_rate": 9.765191054744305e-06,
"loss": 0.1876,
"step": 133
},
{
"epoch": 0.19734904270986744,
"grad_norm": 2.777681310150956,
"learning_rate": 9.761675244931772e-06,
"loss": 0.1981,
"step": 134
},
{
"epoch": 0.19882179675994108,
"grad_norm": 3.391020475699768,
"learning_rate": 9.75813395157765e-06,
"loss": 0.2165,
"step": 135
},
{
"epoch": 0.20029455081001474,
"grad_norm": 3.1826923463080923,
"learning_rate": 9.754567193634232e-06,
"loss": 0.2107,
"step": 136
},
{
"epoch": 0.20176730486008837,
"grad_norm": 2.6839128866173563,
"learning_rate": 9.750974990190107e-06,
"loss": 0.2013,
"step": 137
},
{
"epoch": 0.203240058910162,
"grad_norm": 4.2008156135049814,
"learning_rate": 9.747357360470033e-06,
"loss": 0.2419,
"step": 138
},
{
"epoch": 0.20471281296023564,
"grad_norm": 4.027928958269575,
"learning_rate": 9.743714323834844e-06,
"loss": 0.2175,
"step": 139
},
{
"epoch": 0.20618556701030927,
"grad_norm": 3.115166800077054,
"learning_rate": 9.740045899781353e-06,
"loss": 0.221,
"step": 140
},
{
"epoch": 0.20765832106038293,
"grad_norm": 4.0874344767769974,
"learning_rate": 9.736352107942237e-06,
"loss": 0.2442,
"step": 141
},
{
"epoch": 0.20913107511045656,
"grad_norm": 3.565915108388009,
"learning_rate": 9.732632968085937e-06,
"loss": 0.1765,
"step": 142
},
{
"epoch": 0.2106038291605302,
"grad_norm": 2.565264174585471,
"learning_rate": 9.728888500116551e-06,
"loss": 0.1833,
"step": 143
},
{
"epoch": 0.21207658321060383,
"grad_norm": 3.2970463396097145,
"learning_rate": 9.725118724073732e-06,
"loss": 0.1681,
"step": 144
},
{
"epoch": 0.21354933726067746,
"grad_norm": 3.985651307847386,
"learning_rate": 9.721323660132572e-06,
"loss": 0.2852,
"step": 145
},
{
"epoch": 0.21502209131075112,
"grad_norm": 2.6390893883353037,
"learning_rate": 9.717503328603499e-06,
"loss": 0.2032,
"step": 146
},
{
"epoch": 0.21649484536082475,
"grad_norm": 2.8556278741462475,
"learning_rate": 9.713657749932172e-06,
"loss": 0.2305,
"step": 147
},
{
"epoch": 0.21796759941089838,
"grad_norm": 3.3789042172013737,
"learning_rate": 9.709786944699364e-06,
"loss": 0.2362,
"step": 148
},
{
"epoch": 0.21944035346097202,
"grad_norm": 3.5664134680045967,
"learning_rate": 9.705890933620859e-06,
"loss": 0.2556,
"step": 149
},
{
"epoch": 0.22091310751104565,
"grad_norm": 2.6115979526806226,
"learning_rate": 9.701969737547332e-06,
"loss": 0.2149,
"step": 150
},
{
"epoch": 0.22238586156111928,
"grad_norm": 3.199894028609453,
"learning_rate": 9.69802337746425e-06,
"loss": 0.1842,
"step": 151
},
{
"epoch": 0.22385861561119294,
"grad_norm": 2.9713680676755643,
"learning_rate": 9.694051874491748e-06,
"loss": 0.2037,
"step": 152
},
{
"epoch": 0.22533136966126657,
"grad_norm": 3.371703034689337,
"learning_rate": 9.690055249884524e-06,
"loss": 0.2674,
"step": 153
},
{
"epoch": 0.2268041237113402,
"grad_norm": 2.9536771126823784,
"learning_rate": 9.68603352503172e-06,
"loss": 0.2223,
"step": 154
},
{
"epoch": 0.22827687776141384,
"grad_norm": 2.7924914660332316,
"learning_rate": 9.681986721456806e-06,
"loss": 0.1998,
"step": 155
},
{
"epoch": 0.22974963181148747,
"grad_norm": 2.9038543623288446,
"learning_rate": 9.677914860817476e-06,
"loss": 0.2138,
"step": 156
},
{
"epoch": 0.23122238586156113,
"grad_norm": 3.2780366704955313,
"learning_rate": 9.67381796490552e-06,
"loss": 0.1881,
"step": 157
},
{
"epoch": 0.23269513991163476,
"grad_norm": 2.8392198609551795,
"learning_rate": 9.669696055646713e-06,
"loss": 0.1791,
"step": 158
},
{
"epoch": 0.2341678939617084,
"grad_norm": 2.519298030175099,
"learning_rate": 9.665549155100696e-06,
"loss": 0.1713,
"step": 159
},
{
"epoch": 0.23564064801178203,
"grad_norm": 2.944710796481666,
"learning_rate": 9.661377285460856e-06,
"loss": 0.2036,
"step": 160
},
{
"epoch": 0.23711340206185566,
"grad_norm": 3.7896683971691925,
"learning_rate": 9.657180469054213e-06,
"loss": 0.2603,
"step": 161
},
{
"epoch": 0.23858615611192932,
"grad_norm": 2.9596722312146784,
"learning_rate": 9.652958728341296e-06,
"loss": 0.1781,
"step": 162
},
{
"epoch": 0.24005891016200295,
"grad_norm": 3.214060280061799,
"learning_rate": 9.648712085916025e-06,
"loss": 0.1915,
"step": 163
},
{
"epoch": 0.24153166421207659,
"grad_norm": 3.187133469928152,
"learning_rate": 9.644440564505589e-06,
"loss": 0.1834,
"step": 164
},
{
"epoch": 0.24300441826215022,
"grad_norm": 3.681229252431609,
"learning_rate": 9.640144186970319e-06,
"loss": 0.2262,
"step": 165
},
{
"epoch": 0.24447717231222385,
"grad_norm": 3.2016122797845363,
"learning_rate": 9.635822976303582e-06,
"loss": 0.1706,
"step": 166
},
{
"epoch": 0.24594992636229748,
"grad_norm": 3.730689931811761,
"learning_rate": 9.631476955631636e-06,
"loss": 0.2484,
"step": 167
},
{
"epoch": 0.24742268041237114,
"grad_norm": 3.494482628690699,
"learning_rate": 9.627106148213521e-06,
"loss": 0.2417,
"step": 168
},
{
"epoch": 0.24889543446244478,
"grad_norm": 2.7046561661674176,
"learning_rate": 9.622710577440936e-06,
"loss": 0.1682,
"step": 169
},
{
"epoch": 0.2503681885125184,
"grad_norm": 3.5016680527035198,
"learning_rate": 9.6182902668381e-06,
"loss": 0.2391,
"step": 170
},
{
"epoch": 0.25184094256259204,
"grad_norm": 2.106385345129964,
"learning_rate": 9.613845240061642e-06,
"loss": 0.1651,
"step": 171
},
{
"epoch": 0.2533136966126657,
"grad_norm": 3.5843407537461576,
"learning_rate": 9.60937552090046e-06,
"loss": 0.2173,
"step": 172
},
{
"epoch": 0.2547864506627393,
"grad_norm": 2.6949160731602086,
"learning_rate": 9.604881133275606e-06,
"loss": 0.1804,
"step": 173
},
{
"epoch": 0.25625920471281294,
"grad_norm": 2.698160920515062,
"learning_rate": 9.600362101240153e-06,
"loss": 0.179,
"step": 174
},
{
"epoch": 0.25773195876288657,
"grad_norm": 2.8058773510654564,
"learning_rate": 9.595818448979061e-06,
"loss": 0.215,
"step": 175
},
{
"epoch": 0.25920471281296026,
"grad_norm": 2.645103914035817,
"learning_rate": 9.591250200809061e-06,
"loss": 0.167,
"step": 176
},
{
"epoch": 0.2606774668630339,
"grad_norm": 2.5985448999120946,
"learning_rate": 9.586657381178506e-06,
"loss": 0.174,
"step": 177
},
{
"epoch": 0.2621502209131075,
"grad_norm": 2.3143387309459844,
"learning_rate": 9.582040014667258e-06,
"loss": 0.1872,
"step": 178
},
{
"epoch": 0.26362297496318116,
"grad_norm": 3.438390359114222,
"learning_rate": 9.577398125986546e-06,
"loss": 0.2256,
"step": 179
},
{
"epoch": 0.2650957290132548,
"grad_norm": 3.0621291109544932,
"learning_rate": 9.57273173997884e-06,
"loss": 0.2013,
"step": 180
},
{
"epoch": 0.2665684830633284,
"grad_norm": 2.589989781920295,
"learning_rate": 9.56804088161771e-06,
"loss": 0.2082,
"step": 181
},
{
"epoch": 0.26804123711340205,
"grad_norm": 3.1290423588543517,
"learning_rate": 9.563325576007702e-06,
"loss": 0.2381,
"step": 182
},
{
"epoch": 0.2695139911634757,
"grad_norm": 2.5850784305510026,
"learning_rate": 9.558585848384194e-06,
"loss": 0.1492,
"step": 183
},
{
"epoch": 0.2709867452135493,
"grad_norm": 3.197169330104788,
"learning_rate": 9.553821724113268e-06,
"loss": 0.2513,
"step": 184
},
{
"epoch": 0.27245949926362295,
"grad_norm": 2.7955706520257455,
"learning_rate": 9.549033228691576e-06,
"loss": 0.1996,
"step": 185
},
{
"epoch": 0.27393225331369664,
"grad_norm": 2.7908291417631053,
"learning_rate": 9.544220387746193e-06,
"loss": 0.2033,
"step": 186
},
{
"epoch": 0.27540500736377027,
"grad_norm": 2.741273817653423,
"learning_rate": 9.539383227034489e-06,
"loss": 0.1994,
"step": 187
},
{
"epoch": 0.2768777614138439,
"grad_norm": 3.1305777720744628,
"learning_rate": 9.534521772443989e-06,
"loss": 0.1924,
"step": 188
},
{
"epoch": 0.27835051546391754,
"grad_norm": 3.7426036500166684,
"learning_rate": 9.529636049992235e-06,
"loss": 0.2504,
"step": 189
},
{
"epoch": 0.27982326951399117,
"grad_norm": 2.0789109823416196,
"learning_rate": 9.524726085826645e-06,
"loss": 0.1498,
"step": 190
},
{
"epoch": 0.2812960235640648,
"grad_norm": 2.0879395036515085,
"learning_rate": 9.519791906224372e-06,
"loss": 0.1742,
"step": 191
},
{
"epoch": 0.28276877761413843,
"grad_norm": 3.6332083539027122,
"learning_rate": 9.514833537592167e-06,
"loss": 0.2477,
"step": 192
},
{
"epoch": 0.28424153166421207,
"grad_norm": 3.4011295429243953,
"learning_rate": 9.509851006466235e-06,
"loss": 0.2181,
"step": 193
},
{
"epoch": 0.2857142857142857,
"grad_norm": 3.2524842525432214,
"learning_rate": 9.504844339512096e-06,
"loss": 0.1913,
"step": 194
},
{
"epoch": 0.28718703976435933,
"grad_norm": 3.2393209593702026,
"learning_rate": 9.499813563524439e-06,
"loss": 0.2564,
"step": 195
},
{
"epoch": 0.28865979381443296,
"grad_norm": 2.7259868505734883,
"learning_rate": 9.494758705426978e-06,
"loss": 0.2362,
"step": 196
},
{
"epoch": 0.29013254786450665,
"grad_norm": 3.2730541713875954,
"learning_rate": 9.48967979227231e-06,
"loss": 0.2708,
"step": 197
},
{
"epoch": 0.2916053019145803,
"grad_norm": 3.6942590779013837,
"learning_rate": 9.484576851241774e-06,
"loss": 0.2598,
"step": 198
},
{
"epoch": 0.2930780559646539,
"grad_norm": 3.4221617953713044,
"learning_rate": 9.479449909645296e-06,
"loss": 0.2186,
"step": 199
},
{
"epoch": 0.29455081001472755,
"grad_norm": 2.9048709806411153,
"learning_rate": 9.474298994921252e-06,
"loss": 0.1949,
"step": 200
},
{
"epoch": 0.2960235640648012,
"grad_norm": 2.7375080374295315,
"learning_rate": 9.469124134636317e-06,
"loss": 0.2169,
"step": 201
},
{
"epoch": 0.2974963181148748,
"grad_norm": 3.2624468680574483,
"learning_rate": 9.463925356485313e-06,
"loss": 0.2005,
"step": 202
},
{
"epoch": 0.29896907216494845,
"grad_norm": 3.357905644483348,
"learning_rate": 9.458702688291072e-06,
"loss": 0.221,
"step": 203
},
{
"epoch": 0.3004418262150221,
"grad_norm": 2.7139450967157686,
"learning_rate": 9.45345615800428e-06,
"loss": 0.2043,
"step": 204
},
{
"epoch": 0.3019145802650957,
"grad_norm": 2.1826977262886436,
"learning_rate": 9.448185793703325e-06,
"loss": 0.1382,
"step": 205
},
{
"epoch": 0.30338733431516934,
"grad_norm": 3.142821649729342,
"learning_rate": 9.442891623594153e-06,
"loss": 0.2361,
"step": 206
},
{
"epoch": 0.30486008836524303,
"grad_norm": 2.4836224577160446,
"learning_rate": 9.43757367601011e-06,
"loss": 0.2259,
"step": 207
},
{
"epoch": 0.30633284241531666,
"grad_norm": 2.775002424298524,
"learning_rate": 9.432231979411799e-06,
"loss": 0.1841,
"step": 208
},
{
"epoch": 0.3078055964653903,
"grad_norm": 3.0215163458905767,
"learning_rate": 9.426866562386919e-06,
"loss": 0.2017,
"step": 209
},
{
"epoch": 0.30927835051546393,
"grad_norm": 3.075012833268593,
"learning_rate": 9.421477453650118e-06,
"loss": 0.2564,
"step": 210
},
{
"epoch": 0.31075110456553756,
"grad_norm": 2.1033703952542298,
"learning_rate": 9.41606468204284e-06,
"loss": 0.148,
"step": 211
},
{
"epoch": 0.3122238586156112,
"grad_norm": 3.43331899798626,
"learning_rate": 9.410628276533163e-06,
"loss": 0.2211,
"step": 212
},
{
"epoch": 0.3136966126656848,
"grad_norm": 2.85824244857793,
"learning_rate": 9.40516826621565e-06,
"loss": 0.2331,
"step": 213
},
{
"epoch": 0.31516936671575846,
"grad_norm": 2.590034132267709,
"learning_rate": 9.399684680311197e-06,
"loss": 0.1818,
"step": 214
},
{
"epoch": 0.3166421207658321,
"grad_norm": 2.9268108365833183,
"learning_rate": 9.394177548166865e-06,
"loss": 0.2232,
"step": 215
},
{
"epoch": 0.3181148748159057,
"grad_norm": 2.506123086913095,
"learning_rate": 9.388646899255733e-06,
"loss": 0.163,
"step": 216
},
{
"epoch": 0.31958762886597936,
"grad_norm": 2.9103473622929688,
"learning_rate": 9.38309276317674e-06,
"loss": 0.1602,
"step": 217
},
{
"epoch": 0.32106038291605304,
"grad_norm": 3.3616843450895004,
"learning_rate": 9.377515169654518e-06,
"loss": 0.198,
"step": 218
},
{
"epoch": 0.3225331369661267,
"grad_norm": 2.832377450626078,
"learning_rate": 9.371914148539242e-06,
"loss": 0.1925,
"step": 219
},
{
"epoch": 0.3240058910162003,
"grad_norm": 2.430717934544939,
"learning_rate": 9.366289729806468e-06,
"loss": 0.1723,
"step": 220
},
{
"epoch": 0.32547864506627394,
"grad_norm": 2.8065095246039506,
"learning_rate": 9.36064194355697e-06,
"loss": 0.1835,
"step": 221
},
{
"epoch": 0.3269513991163476,
"grad_norm": 3.285856798309481,
"learning_rate": 9.354970820016576e-06,
"loss": 0.2428,
"step": 222
},
{
"epoch": 0.3284241531664212,
"grad_norm": 3.292186036973222,
"learning_rate": 9.349276389536017e-06,
"loss": 0.211,
"step": 223
},
{
"epoch": 0.32989690721649484,
"grad_norm": 3.0051382018089488,
"learning_rate": 9.343558682590757e-06,
"loss": 0.1826,
"step": 224
},
{
"epoch": 0.33136966126656847,
"grad_norm": 3.8927356666818795,
"learning_rate": 9.337817729780826e-06,
"loss": 0.244,
"step": 225
},
{
"epoch": 0.3328424153166421,
"grad_norm": 2.6379798191774175,
"learning_rate": 9.332053561830669e-06,
"loss": 0.1853,
"step": 226
},
{
"epoch": 0.33431516936671574,
"grad_norm": 3.6279970290742147,
"learning_rate": 9.326266209588966e-06,
"loss": 0.2242,
"step": 227
},
{
"epoch": 0.33578792341678937,
"grad_norm": 3.380122497482897,
"learning_rate": 9.320455704028482e-06,
"loss": 0.2192,
"step": 228
},
{
"epoch": 0.33726067746686306,
"grad_norm": 3.2219948474501305,
"learning_rate": 9.314622076245887e-06,
"loss": 0.2326,
"step": 229
},
{
"epoch": 0.3387334315169367,
"grad_norm": 3.0404471997377907,
"learning_rate": 9.308765357461604e-06,
"loss": 0.1872,
"step": 230
},
{
"epoch": 0.3402061855670103,
"grad_norm": 3.0157736275712406,
"learning_rate": 9.302885579019626e-06,
"loss": 0.2216,
"step": 231
},
{
"epoch": 0.34167893961708395,
"grad_norm": 2.716589993960879,
"learning_rate": 9.296982772387366e-06,
"loss": 0.2167,
"step": 232
},
{
"epoch": 0.3431516936671576,
"grad_norm": 3.061532521108386,
"learning_rate": 9.29105696915547e-06,
"loss": 0.2633,
"step": 233
},
{
"epoch": 0.3446244477172312,
"grad_norm": 3.1220356470518773,
"learning_rate": 9.285108201037663e-06,
"loss": 0.2128,
"step": 234
},
{
"epoch": 0.34609720176730485,
"grad_norm": 2.7156005801814964,
"learning_rate": 9.279136499870574e-06,
"loss": 0.1562,
"step": 235
},
{
"epoch": 0.3475699558173785,
"grad_norm": 2.5161864643115837,
"learning_rate": 9.27314189761356e-06,
"loss": 0.208,
"step": 236
},
{
"epoch": 0.3490427098674521,
"grad_norm": 2.81380932528038,
"learning_rate": 9.267124426348549e-06,
"loss": 0.206,
"step": 237
},
{
"epoch": 0.35051546391752575,
"grad_norm": 2.7182189987352454,
"learning_rate": 9.261084118279846e-06,
"loss": 0.1862,
"step": 238
},
{
"epoch": 0.35198821796759944,
"grad_norm": 3.051952841047558,
"learning_rate": 9.255021005733989e-06,
"loss": 0.2654,
"step": 239
},
{
"epoch": 0.35346097201767307,
"grad_norm": 3.2184875593429854,
"learning_rate": 9.248935121159552e-06,
"loss": 0.1956,
"step": 240
},
{
"epoch": 0.3549337260677467,
"grad_norm": 2.955006281633986,
"learning_rate": 9.24282649712698e-06,
"loss": 0.2764,
"step": 241
},
{
"epoch": 0.35640648011782033,
"grad_norm": 2.8157353776869534,
"learning_rate": 9.23669516632842e-06,
"loss": 0.1844,
"step": 242
},
{
"epoch": 0.35787923416789397,
"grad_norm": 2.1637523528646896,
"learning_rate": 9.230541161577535e-06,
"loss": 0.1573,
"step": 243
},
{
"epoch": 0.3593519882179676,
"grad_norm": 2.598383826057999,
"learning_rate": 9.224364515809344e-06,
"loss": 0.1814,
"step": 244
},
{
"epoch": 0.36082474226804123,
"grad_norm": 2.867443024017231,
"learning_rate": 9.218165262080024e-06,
"loss": 0.1963,
"step": 245
},
{
"epoch": 0.36229749631811486,
"grad_norm": 3.1621315329847683,
"learning_rate": 9.211943433566755e-06,
"loss": 0.2315,
"step": 246
},
{
"epoch": 0.3637702503681885,
"grad_norm": 2.7573640644608894,
"learning_rate": 9.205699063567528e-06,
"loss": 0.2127,
"step": 247
},
{
"epoch": 0.36524300441826213,
"grad_norm": 3.225352548953467,
"learning_rate": 9.199432185500972e-06,
"loss": 0.2467,
"step": 248
},
{
"epoch": 0.36671575846833576,
"grad_norm": 3.1363371958388484,
"learning_rate": 9.19314283290618e-06,
"loss": 0.2458,
"step": 249
},
{
"epoch": 0.36818851251840945,
"grad_norm": 2.704895240551704,
"learning_rate": 9.186831039442514e-06,
"loss": 0.1929,
"step": 250
},
{
"epoch": 0.3696612665684831,
"grad_norm": 3.4728725502728715,
"learning_rate": 9.180496838889446e-06,
"loss": 0.2527,
"step": 251
},
{
"epoch": 0.3711340206185567,
"grad_norm": 3.1470649429210926,
"learning_rate": 9.174140265146356e-06,
"loss": 0.2134,
"step": 252
},
{
"epoch": 0.37260677466863035,
"grad_norm": 3.28728525676738,
"learning_rate": 9.167761352232372e-06,
"loss": 0.2366,
"step": 253
},
{
"epoch": 0.374079528718704,
"grad_norm": 2.69310735559955,
"learning_rate": 9.161360134286166e-06,
"loss": 0.2066,
"step": 254
},
{
"epoch": 0.3755522827687776,
"grad_norm": 2.79351498094659,
"learning_rate": 9.154936645565788e-06,
"loss": 0.1749,
"step": 255
},
{
"epoch": 0.37702503681885124,
"grad_norm": 2.5160724004293806,
"learning_rate": 9.148490920448476e-06,
"loss": 0.2123,
"step": 256
},
{
"epoch": 0.3784977908689249,
"grad_norm": 2.6875394507380137,
"learning_rate": 9.142022993430475e-06,
"loss": 0.1964,
"step": 257
},
{
"epoch": 0.3799705449189985,
"grad_norm": 2.662291621699446,
"learning_rate": 9.135532899126844e-06,
"loss": 0.2049,
"step": 258
},
{
"epoch": 0.38144329896907214,
"grad_norm": 2.810464023609334,
"learning_rate": 9.129020672271283e-06,
"loss": 0.1889,
"step": 259
},
{
"epoch": 0.38291605301914583,
"grad_norm": 2.263563107768252,
"learning_rate": 9.122486347715937e-06,
"loss": 0.1627,
"step": 260
},
{
"epoch": 0.38438880706921946,
"grad_norm": 2.7689382781268033,
"learning_rate": 9.115929960431217e-06,
"loss": 0.2142,
"step": 261
},
{
"epoch": 0.3858615611192931,
"grad_norm": 3.524805623776474,
"learning_rate": 9.109351545505607e-06,
"loss": 0.3073,
"step": 262
},
{
"epoch": 0.3873343151693667,
"grad_norm": 2.9769637254549166,
"learning_rate": 9.10275113814548e-06,
"loss": 0.2254,
"step": 263
},
{
"epoch": 0.38880706921944036,
"grad_norm": 2.674094413353048,
"learning_rate": 9.096128773674902e-06,
"loss": 0.1779,
"step": 264
},
{
"epoch": 0.390279823269514,
"grad_norm": 3.1259498837476634,
"learning_rate": 9.08948448753546e-06,
"loss": 0.1791,
"step": 265
},
{
"epoch": 0.3917525773195876,
"grad_norm": 2.73101491316821,
"learning_rate": 9.082818315286054e-06,
"loss": 0.1905,
"step": 266
},
{
"epoch": 0.39322533136966126,
"grad_norm": 3.392701650405703,
"learning_rate": 9.076130292602717e-06,
"loss": 0.2023,
"step": 267
},
{
"epoch": 0.3946980854197349,
"grad_norm": 2.7586772830435424,
"learning_rate": 9.069420455278418e-06,
"loss": 0.2096,
"step": 268
},
{
"epoch": 0.3961708394698085,
"grad_norm": 2.6884641588849028,
"learning_rate": 9.062688839222878e-06,
"loss": 0.1844,
"step": 269
},
{
"epoch": 0.39764359351988215,
"grad_norm": 2.851457009221402,
"learning_rate": 9.055935480462366e-06,
"loss": 0.2127,
"step": 270
},
{
"epoch": 0.39911634756995584,
"grad_norm": 3.200602534780458,
"learning_rate": 9.049160415139525e-06,
"loss": 0.224,
"step": 271
},
{
"epoch": 0.4005891016200295,
"grad_norm": 2.9975515825389323,
"learning_rate": 9.042363679513158e-06,
"loss": 0.1574,
"step": 272
},
{
"epoch": 0.4020618556701031,
"grad_norm": 2.778235077186209,
"learning_rate": 9.035545309958048e-06,
"loss": 0.1813,
"step": 273
},
{
"epoch": 0.40353460972017674,
"grad_norm": 2.1611591561612986,
"learning_rate": 9.028705342964752e-06,
"loss": 0.1311,
"step": 274
},
{
"epoch": 0.4050073637702504,
"grad_norm": 3.1410123112072252,
"learning_rate": 9.021843815139424e-06,
"loss": 0.2018,
"step": 275
},
{
"epoch": 0.406480117820324,
"grad_norm": 2.653212394824665,
"learning_rate": 9.014960763203592e-06,
"loss": 0.1725,
"step": 276
},
{
"epoch": 0.40795287187039764,
"grad_norm": 2.5114048753988327,
"learning_rate": 9.008056223993993e-06,
"loss": 0.1587,
"step": 277
},
{
"epoch": 0.40942562592047127,
"grad_norm": 2.3310789133305034,
"learning_rate": 9.001130234462348e-06,
"loss": 0.1728,
"step": 278
},
{
"epoch": 0.4108983799705449,
"grad_norm": 3.0268391081998067,
"learning_rate": 8.994182831675176e-06,
"loss": 0.2069,
"step": 279
},
{
"epoch": 0.41237113402061853,
"grad_norm": 3.176729678522873,
"learning_rate": 8.987214052813605e-06,
"loss": 0.202,
"step": 280
},
{
"epoch": 0.41384388807069217,
"grad_norm": 2.900999234971349,
"learning_rate": 8.980223935173153e-06,
"loss": 0.2039,
"step": 281
},
{
"epoch": 0.41531664212076586,
"grad_norm": 2.958916451457122,
"learning_rate": 8.973212516163545e-06,
"loss": 0.223,
"step": 282
},
{
"epoch": 0.4167893961708395,
"grad_norm": 3.2062050255647616,
"learning_rate": 8.966179833308506e-06,
"loss": 0.2008,
"step": 283
},
{
"epoch": 0.4182621502209131,
"grad_norm": 2.3023016729588646,
"learning_rate": 8.959125924245559e-06,
"loss": 0.1724,
"step": 284
},
{
"epoch": 0.41973490427098675,
"grad_norm": 3.3857256850661788,
"learning_rate": 8.952050826725827e-06,
"loss": 0.208,
"step": 285
},
{
"epoch": 0.4212076583210604,
"grad_norm": 2.9980500030235273,
"learning_rate": 8.944954578613826e-06,
"loss": 0.1935,
"step": 286
},
{
"epoch": 0.422680412371134,
"grad_norm": 2.4335052728686826,
"learning_rate": 8.937837217887273e-06,
"loss": 0.1385,
"step": 287
},
{
"epoch": 0.42415316642120765,
"grad_norm": 2.7372999604088113,
"learning_rate": 8.930698782636868e-06,
"loss": 0.1982,
"step": 288
},
{
"epoch": 0.4256259204712813,
"grad_norm": 3.3460685534877888,
"learning_rate": 8.923539311066101e-06,
"loss": 0.2435,
"step": 289
},
{
"epoch": 0.4270986745213549,
"grad_norm": 3.073278464461925,
"learning_rate": 8.916358841491046e-06,
"loss": 0.2257,
"step": 290
},
{
"epoch": 0.42857142857142855,
"grad_norm": 3.033222631807686,
"learning_rate": 8.90915741234015e-06,
"loss": 0.1561,
"step": 291
},
{
"epoch": 0.43004418262150224,
"grad_norm": 2.9077878079684014,
"learning_rate": 8.901935062154035e-06,
"loss": 0.1688,
"step": 292
},
{
"epoch": 0.43151693667157587,
"grad_norm": 2.4746618411647914,
"learning_rate": 8.894691829585285e-06,
"loss": 0.1901,
"step": 293
},
{
"epoch": 0.4329896907216495,
"grad_norm": 2.664815899368701,
"learning_rate": 8.887427753398249e-06,
"loss": 0.1554,
"step": 294
},
{
"epoch": 0.43446244477172313,
"grad_norm": 3.0348226117764163,
"learning_rate": 8.880142872468816e-06,
"loss": 0.2212,
"step": 295
},
{
"epoch": 0.43593519882179677,
"grad_norm": 2.6085379817053425,
"learning_rate": 8.872837225784227e-06,
"loss": 0.1719,
"step": 296
},
{
"epoch": 0.4374079528718704,
"grad_norm": 3.2317544602456123,
"learning_rate": 8.865510852442854e-06,
"loss": 0.2083,
"step": 297
},
{
"epoch": 0.43888070692194403,
"grad_norm": 2.741617300142697,
"learning_rate": 8.858163791653994e-06,
"loss": 0.1938,
"step": 298
},
{
"epoch": 0.44035346097201766,
"grad_norm": 2.8748881811894993,
"learning_rate": 8.85079608273766e-06,
"loss": 0.1682,
"step": 299
},
{
"epoch": 0.4418262150220913,
"grad_norm": 3.0709289314464274,
"learning_rate": 8.84340776512437e-06,
"loss": 0.2368,
"step": 300
},
{
"epoch": 0.44329896907216493,
"grad_norm": 3.332976226058547,
"learning_rate": 8.83599887835493e-06,
"loss": 0.268,
"step": 301
},
{
"epoch": 0.44477172312223856,
"grad_norm": 3.2928336415795676,
"learning_rate": 8.82856946208024e-06,
"loss": 0.2426,
"step": 302
},
{
"epoch": 0.44624447717231225,
"grad_norm": 3.0918828260694644,
"learning_rate": 8.821119556061054e-06,
"loss": 0.2047,
"step": 303
},
{
"epoch": 0.4477172312223859,
"grad_norm": 2.6455271835816316,
"learning_rate": 8.8136492001678e-06,
"loss": 0.2242,
"step": 304
},
{
"epoch": 0.4491899852724595,
"grad_norm": 2.367999960305818,
"learning_rate": 8.806158434380334e-06,
"loss": 0.1498,
"step": 305
},
{
"epoch": 0.45066273932253315,
"grad_norm": 3.1564104973218474,
"learning_rate": 8.798647298787754e-06,
"loss": 0.2197,
"step": 306
},
{
"epoch": 0.4521354933726068,
"grad_norm": 2.7830911081835263,
"learning_rate": 8.791115833588165e-06,
"loss": 0.2114,
"step": 307
},
{
"epoch": 0.4536082474226804,
"grad_norm": 2.339228369282024,
"learning_rate": 8.783564079088478e-06,
"loss": 0.194,
"step": 308
},
{
"epoch": 0.45508100147275404,
"grad_norm": 3.191493300938616,
"learning_rate": 8.775992075704181e-06,
"loss": 0.203,
"step": 309
},
{
"epoch": 0.4565537555228277,
"grad_norm": 3.1503597728255475,
"learning_rate": 8.76839986395914e-06,
"loss": 0.1917,
"step": 310
},
{
"epoch": 0.4580265095729013,
"grad_norm": 2.5760890814393615,
"learning_rate": 8.760787484485362e-06,
"loss": 0.1699,
"step": 311
},
{
"epoch": 0.45949926362297494,
"grad_norm": 2.508820257174523,
"learning_rate": 8.753154978022795e-06,
"loss": 0.1699,
"step": 312
},
{
"epoch": 0.46097201767304863,
"grad_norm": 2.815681439486473,
"learning_rate": 8.7455023854191e-06,
"loss": 0.2175,
"step": 313
},
{
"epoch": 0.46244477172312226,
"grad_norm": 2.3474709844903843,
"learning_rate": 8.737829747629432e-06,
"loss": 0.1524,
"step": 314
},
{
"epoch": 0.4639175257731959,
"grad_norm": 2.6627577805253533,
"learning_rate": 8.730137105716231e-06,
"loss": 0.2368,
"step": 315
},
{
"epoch": 0.4653902798232695,
"grad_norm": 3.6483303897239963,
"learning_rate": 8.722424500848988e-06,
"loss": 0.1976,
"step": 316
},
{
"epoch": 0.46686303387334316,
"grad_norm": 2.935242259870657,
"learning_rate": 8.714691974304035e-06,
"loss": 0.2103,
"step": 317
},
{
"epoch": 0.4683357879234168,
"grad_norm": 2.8226822109484724,
"learning_rate": 8.706939567464322e-06,
"loss": 0.2348,
"step": 318
},
{
"epoch": 0.4698085419734904,
"grad_norm": 3.703059329119747,
"learning_rate": 8.69916732181919e-06,
"loss": 0.2174,
"step": 319
},
{
"epoch": 0.47128129602356406,
"grad_norm": 2.9480735349519307,
"learning_rate": 8.691375278964161e-06,
"loss": 0.2008,
"step": 320
},
{
"epoch": 0.4727540500736377,
"grad_norm": 3.091324162542489,
"learning_rate": 8.6835634806007e-06,
"loss": 0.2477,
"step": 321
},
{
"epoch": 0.4742268041237113,
"grad_norm": 3.5902022399029834,
"learning_rate": 8.675731968536004e-06,
"loss": 0.2442,
"step": 322
},
{
"epoch": 0.47569955817378495,
"grad_norm": 2.939683038888228,
"learning_rate": 8.66788078468277e-06,
"loss": 0.2198,
"step": 323
},
{
"epoch": 0.47717231222385864,
"grad_norm": 2.8812334528236376,
"learning_rate": 8.660009971058977e-06,
"loss": 0.1993,
"step": 324
},
{
"epoch": 0.4786450662739323,
"grad_norm": 3.6057168421211787,
"learning_rate": 8.652119569787663e-06,
"loss": 0.2258,
"step": 325
},
{
"epoch": 0.4801178203240059,
"grad_norm": 3.597288920583859,
"learning_rate": 8.644209623096686e-06,
"loss": 0.2545,
"step": 326
},
{
"epoch": 0.48159057437407954,
"grad_norm": 3.4390171712068756,
"learning_rate": 8.636280173318517e-06,
"loss": 0.1918,
"step": 327
},
{
"epoch": 0.48306332842415317,
"grad_norm": 3.032127662830995,
"learning_rate": 8.628331262889992e-06,
"loss": 0.2046,
"step": 328
},
{
"epoch": 0.4845360824742268,
"grad_norm": 2.676901174668902,
"learning_rate": 8.620362934352109e-06,
"loss": 0.1815,
"step": 329
},
{
"epoch": 0.48600883652430044,
"grad_norm": 3.830805699602978,
"learning_rate": 8.612375230349779e-06,
"loss": 0.2411,
"step": 330
},
{
"epoch": 0.48748159057437407,
"grad_norm": 2.4906900864812305,
"learning_rate": 8.60436819363161e-06,
"loss": 0.1684,
"step": 331
},
{
"epoch": 0.4889543446244477,
"grad_norm": 2.4444113482232126,
"learning_rate": 8.596341867049677e-06,
"loss": 0.1467,
"step": 332
},
{
"epoch": 0.49042709867452133,
"grad_norm": 3.4954288127618365,
"learning_rate": 8.588296293559286e-06,
"loss": 0.2419,
"step": 333
},
{
"epoch": 0.49189985272459497,
"grad_norm": 2.5438367926199135,
"learning_rate": 8.58023151621875e-06,
"loss": 0.1703,
"step": 334
},
{
"epoch": 0.49337260677466865,
"grad_norm": 3.038027697084942,
"learning_rate": 8.57214757818916e-06,
"loss": 0.228,
"step": 335
},
{
"epoch": 0.4948453608247423,
"grad_norm": 3.7548333153148867,
"learning_rate": 8.564044522734147e-06,
"loss": 0.257,
"step": 336
},
{
"epoch": 0.4963181148748159,
"grad_norm": 2.7157802194600382,
"learning_rate": 8.55592239321966e-06,
"loss": 0.1518,
"step": 337
},
{
"epoch": 0.49779086892488955,
"grad_norm": 2.19265493343489,
"learning_rate": 8.54778123311372e-06,
"loss": 0.1607,
"step": 338
},
{
"epoch": 0.4992636229749632,
"grad_norm": 2.6816534246794634,
"learning_rate": 8.539621085986209e-06,
"loss": 0.1868,
"step": 339
},
{
"epoch": 0.5007363770250368,
"grad_norm": 3.87477423163916,
"learning_rate": 8.531441995508609e-06,
"loss": 0.2445,
"step": 340
},
{
"epoch": 0.5022091310751104,
"grad_norm": 3.1386210194757274,
"learning_rate": 8.523244005453795e-06,
"loss": 0.2235,
"step": 341
},
{
"epoch": 0.5036818851251841,
"grad_norm": 3.0575398322579868,
"learning_rate": 8.515027159695781e-06,
"loss": 0.195,
"step": 342
},
{
"epoch": 0.5051546391752577,
"grad_norm": 3.259828431228313,
"learning_rate": 8.506791502209497e-06,
"loss": 0.1942,
"step": 343
},
{
"epoch": 0.5066273932253313,
"grad_norm": 2.8199139166564,
"learning_rate": 8.498537077070548e-06,
"loss": 0.1506,
"step": 344
},
{
"epoch": 0.508100147275405,
"grad_norm": 3.0673180182256643,
"learning_rate": 8.490263928454983e-06,
"loss": 0.1812,
"step": 345
},
{
"epoch": 0.5095729013254786,
"grad_norm": 2.7003191008577185,
"learning_rate": 8.481972100639049e-06,
"loss": 0.1724,
"step": 346
},
{
"epoch": 0.5110456553755522,
"grad_norm": 3.3788019563020666,
"learning_rate": 8.473661637998966e-06,
"loss": 0.1898,
"step": 347
},
{
"epoch": 0.5125184094256259,
"grad_norm": 2.2754214416579783,
"learning_rate": 8.465332585010682e-06,
"loss": 0.1631,
"step": 348
},
{
"epoch": 0.5139911634756995,
"grad_norm": 2.578239855031428,
"learning_rate": 8.456984986249636e-06,
"loss": 0.1824,
"step": 349
},
{
"epoch": 0.5154639175257731,
"grad_norm": 2.5873653173219266,
"learning_rate": 8.448618886390523e-06,
"loss": 0.203,
"step": 350
},
{
"epoch": 0.5169366715758469,
"grad_norm": 3.945703180252543,
"learning_rate": 8.440234330207047e-06,
"loss": 0.1796,
"step": 351
},
{
"epoch": 0.5184094256259205,
"grad_norm": 3.3592392251092846,
"learning_rate": 8.431831362571692e-06,
"loss": 0.2058,
"step": 352
},
{
"epoch": 0.5198821796759941,
"grad_norm": 3.283241191967628,
"learning_rate": 8.423410028455474e-06,
"loss": 0.2041,
"step": 353
},
{
"epoch": 0.5213549337260678,
"grad_norm": 2.5399001423784227,
"learning_rate": 8.414970372927705e-06,
"loss": 0.1664,
"step": 354
},
{
"epoch": 0.5228276877761414,
"grad_norm": 3.338954536109113,
"learning_rate": 8.406512441155746e-06,
"loss": 0.2712,
"step": 355
},
{
"epoch": 0.524300441826215,
"grad_norm": 2.5050371298566056,
"learning_rate": 8.398036278404768e-06,
"loss": 0.1839,
"step": 356
},
{
"epoch": 0.5257731958762887,
"grad_norm": 2.1797030450497594,
"learning_rate": 8.389541930037516e-06,
"loss": 0.1578,
"step": 357
},
{
"epoch": 0.5272459499263623,
"grad_norm": 2.479492098411274,
"learning_rate": 8.38102944151406e-06,
"loss": 0.1836,
"step": 358
},
{
"epoch": 0.5287187039764359,
"grad_norm": 3.2045132083900225,
"learning_rate": 8.372498858391545e-06,
"loss": 0.2813,
"step": 359
},
{
"epoch": 0.5301914580265096,
"grad_norm": 2.6722264401815323,
"learning_rate": 8.363950226323963e-06,
"loss": 0.1736,
"step": 360
},
{
"epoch": 0.5316642120765832,
"grad_norm": 2.69073149052516,
"learning_rate": 8.355383591061898e-06,
"loss": 0.1817,
"step": 361
},
{
"epoch": 0.5331369661266568,
"grad_norm": 3.065347755873716,
"learning_rate": 8.346798998452283e-06,
"loss": 0.1754,
"step": 362
},
{
"epoch": 0.5346097201767305,
"grad_norm": 3.0548564778815,
"learning_rate": 8.338196494438153e-06,
"loss": 0.2002,
"step": 363
},
{
"epoch": 0.5360824742268041,
"grad_norm": 2.9790505648872436,
"learning_rate": 8.329576125058406e-06,
"loss": 0.2348,
"step": 364
},
{
"epoch": 0.5375552282768777,
"grad_norm": 4.563506545157317,
"learning_rate": 8.320937936447549e-06,
"loss": 0.2441,
"step": 365
},
{
"epoch": 0.5390279823269514,
"grad_norm": 3.37684954182675,
"learning_rate": 8.312281974835452e-06,
"loss": 0.2578,
"step": 366
},
{
"epoch": 0.540500736377025,
"grad_norm": 3.3361858569689415,
"learning_rate": 8.303608286547109e-06,
"loss": 0.2456,
"step": 367
},
{
"epoch": 0.5419734904270986,
"grad_norm": 2.3890129851813424,
"learning_rate": 8.294916918002377e-06,
"loss": 0.1602,
"step": 368
},
{
"epoch": 0.5434462444771723,
"grad_norm": 3.0258627442553983,
"learning_rate": 8.286207915715733e-06,
"loss": 0.2154,
"step": 369
},
{
"epoch": 0.5449189985272459,
"grad_norm": 2.9412210960720238,
"learning_rate": 8.277481326296039e-06,
"loss": 0.2335,
"step": 370
},
{
"epoch": 0.5463917525773195,
"grad_norm": 3.602454180004041,
"learning_rate": 8.268737196446264e-06,
"loss": 0.2525,
"step": 371
},
{
"epoch": 0.5478645066273933,
"grad_norm": 2.9183607705714922,
"learning_rate": 8.259975572963257e-06,
"loss": 0.2667,
"step": 372
},
{
"epoch": 0.5493372606774669,
"grad_norm": 2.6116150881709195,
"learning_rate": 8.251196502737496e-06,
"loss": 0.2216,
"step": 373
},
{
"epoch": 0.5508100147275405,
"grad_norm": 3.3432845876969988,
"learning_rate": 8.242400032752813e-06,
"loss": 0.2595,
"step": 374
},
{
"epoch": 0.5522827687776142,
"grad_norm": 2.583209817811413,
"learning_rate": 8.233586210086182e-06,
"loss": 0.1627,
"step": 375
},
{
"epoch": 0.5537555228276878,
"grad_norm": 3.234454619219617,
"learning_rate": 8.224755081907427e-06,
"loss": 0.2192,
"step": 376
},
{
"epoch": 0.5552282768777614,
"grad_norm": 3.314191004035807,
"learning_rate": 8.215906695478997e-06,
"loss": 0.1975,
"step": 377
},
{
"epoch": 0.5567010309278351,
"grad_norm": 3.749827270518619,
"learning_rate": 8.207041098155701e-06,
"loss": 0.2513,
"step": 378
},
{
"epoch": 0.5581737849779087,
"grad_norm": 2.7173241405482083,
"learning_rate": 8.198158337384457e-06,
"loss": 0.1781,
"step": 379
},
{
"epoch": 0.5596465390279823,
"grad_norm": 2.481104647636881,
"learning_rate": 8.189258460704039e-06,
"loss": 0.1881,
"step": 380
},
{
"epoch": 0.561119293078056,
"grad_norm": 2.709815508885945,
"learning_rate": 8.180341515744823e-06,
"loss": 0.2051,
"step": 381
},
{
"epoch": 0.5625920471281296,
"grad_norm": 2.3543635541046917,
"learning_rate": 8.171407550228532e-06,
"loss": 0.1798,
"step": 382
},
{
"epoch": 0.5640648011782032,
"grad_norm": 2.681937385638091,
"learning_rate": 8.162456611967972e-06,
"loss": 0.1937,
"step": 383
},
{
"epoch": 0.5655375552282769,
"grad_norm": 3.0109379848386633,
"learning_rate": 8.153488748866795e-06,
"loss": 0.2047,
"step": 384
},
{
"epoch": 0.5670103092783505,
"grad_norm": 3.2994675192690464,
"learning_rate": 8.144504008919224e-06,
"loss": 0.1708,
"step": 385
},
{
"epoch": 0.5684830633284241,
"grad_norm": 2.63707536509823,
"learning_rate": 8.135502440209803e-06,
"loss": 0.1706,
"step": 386
},
{
"epoch": 0.5699558173784978,
"grad_norm": 2.5778988818118593,
"learning_rate": 8.126484090913148e-06,
"loss": 0.1613,
"step": 387
},
{
"epoch": 0.5714285714285714,
"grad_norm": 2.9880786102250383,
"learning_rate": 8.117449009293668e-06,
"loss": 0.2124,
"step": 388
},
{
"epoch": 0.572901325478645,
"grad_norm": 2.7339186904426422,
"learning_rate": 8.108397243705335e-06,
"loss": 0.1767,
"step": 389
},
{
"epoch": 0.5743740795287187,
"grad_norm": 2.920927383070506,
"learning_rate": 8.0993288425914e-06,
"loss": 0.1948,
"step": 390
},
{
"epoch": 0.5758468335787923,
"grad_norm": 2.9959486658471417,
"learning_rate": 8.09024385448415e-06,
"loss": 0.1987,
"step": 391
},
{
"epoch": 0.5773195876288659,
"grad_norm": 2.6078343603636114,
"learning_rate": 8.081142328004638e-06,
"loss": 0.1777,
"step": 392
},
{
"epoch": 0.5787923416789397,
"grad_norm": 2.5604795545389205,
"learning_rate": 8.072024311862426e-06,
"loss": 0.1508,
"step": 393
},
{
"epoch": 0.5802650957290133,
"grad_norm": 3.648602445911234,
"learning_rate": 8.062889854855334e-06,
"loss": 0.2459,
"step": 394
},
{
"epoch": 0.5817378497790869,
"grad_norm": 2.677809870070787,
"learning_rate": 8.053739005869158e-06,
"loss": 0.1722,
"step": 395
},
{
"epoch": 0.5832106038291606,
"grad_norm": 3.3192707460526325,
"learning_rate": 8.044571813877431e-06,
"loss": 0.2267,
"step": 396
},
{
"epoch": 0.5846833578792342,
"grad_norm": 2.5799305333056837,
"learning_rate": 8.035388327941147e-06,
"loss": 0.1277,
"step": 397
},
{
"epoch": 0.5861561119293078,
"grad_norm": 3.117270413994588,
"learning_rate": 8.0261885972085e-06,
"loss": 0.2293,
"step": 398
},
{
"epoch": 0.5876288659793815,
"grad_norm": 2.9003896598861867,
"learning_rate": 8.016972670914624e-06,
"loss": 0.1986,
"step": 399
},
{
"epoch": 0.5891016200294551,
"grad_norm": 2.9415851694090525,
"learning_rate": 8.007740598381329e-06,
"loss": 0.2323,
"step": 400
},
{
"epoch": 0.5905743740795287,
"grad_norm": 2.5157737935406477,
"learning_rate": 7.998492429016837e-06,
"loss": 0.1968,
"step": 401
},
{
"epoch": 0.5920471281296024,
"grad_norm": 3.499617736336555,
"learning_rate": 7.989228212315516e-06,
"loss": 0.2127,
"step": 402
},
{
"epoch": 0.593519882179676,
"grad_norm": 3.174124598925698,
"learning_rate": 7.979947997857617e-06,
"loss": 0.219,
"step": 403
},
{
"epoch": 0.5949926362297496,
"grad_norm": 2.5812414659778278,
"learning_rate": 7.970651835309009e-06,
"loss": 0.1796,
"step": 404
},
{
"epoch": 0.5964653902798233,
"grad_norm": 2.4138046810747236,
"learning_rate": 7.961339774420907e-06,
"loss": 0.1878,
"step": 405
},
{
"epoch": 0.5979381443298969,
"grad_norm": 2.8540565839576555,
"learning_rate": 7.952011865029614e-06,
"loss": 0.1945,
"step": 406
},
{
"epoch": 0.5994108983799705,
"grad_norm": 2.531347080366607,
"learning_rate": 7.942668157056255e-06,
"loss": 0.1378,
"step": 407
},
{
"epoch": 0.6008836524300442,
"grad_norm": 3.154901934995226,
"learning_rate": 7.933308700506497e-06,
"loss": 0.1616,
"step": 408
},
{
"epoch": 0.6023564064801178,
"grad_norm": 3.053713830746646,
"learning_rate": 7.923933545470301e-06,
"loss": 0.2038,
"step": 409
},
{
"epoch": 0.6038291605301914,
"grad_norm": 3.5319559062347174,
"learning_rate": 7.914542742121632e-06,
"loss": 0.2189,
"step": 410
},
{
"epoch": 0.605301914580265,
"grad_norm": 2.929920661597212,
"learning_rate": 7.905136340718212e-06,
"loss": 0.1999,
"step": 411
},
{
"epoch": 0.6067746686303387,
"grad_norm": 3.20901586200067,
"learning_rate": 7.895714391601232e-06,
"loss": 0.2332,
"step": 412
},
{
"epoch": 0.6082474226804123,
"grad_norm": 2.7241977220902927,
"learning_rate": 7.886276945195098e-06,
"loss": 0.1715,
"step": 413
},
{
"epoch": 0.6097201767304861,
"grad_norm": 2.9678480126718476,
"learning_rate": 7.87682405200715e-06,
"loss": 0.1969,
"step": 414
},
{
"epoch": 0.6111929307805597,
"grad_norm": 3.0479292039013064,
"learning_rate": 7.867355762627397e-06,
"loss": 0.1538,
"step": 415
},
{
"epoch": 0.6126656848306333,
"grad_norm": 2.3888661846984247,
"learning_rate": 7.857872127728248e-06,
"loss": 0.1739,
"step": 416
},
{
"epoch": 0.614138438880707,
"grad_norm": 3.346527102339922,
"learning_rate": 7.848373198064237e-06,
"loss": 0.208,
"step": 417
},
{
"epoch": 0.6156111929307806,
"grad_norm": 2.813215877351235,
"learning_rate": 7.838859024471747e-06,
"loss": 0.1854,
"step": 418
},
{
"epoch": 0.6170839469808542,
"grad_norm": 3.009571356055603,
"learning_rate": 7.829329657868753e-06,
"loss": 0.1925,
"step": 419
},
{
"epoch": 0.6185567010309279,
"grad_norm": 2.4693834346638193,
"learning_rate": 7.819785149254534e-06,
"loss": 0.1482,
"step": 420
},
{
"epoch": 0.6200294550810015,
"grad_norm": 2.823982785483761,
"learning_rate": 7.810225549709404e-06,
"loss": 0.1986,
"step": 421
},
{
"epoch": 0.6215022091310751,
"grad_norm": 2.964260763763868,
"learning_rate": 7.80065091039445e-06,
"loss": 0.2132,
"step": 422
},
{
"epoch": 0.6229749631811488,
"grad_norm": 3.0959584354379572,
"learning_rate": 7.791061282551237e-06,
"loss": 0.2012,
"step": 423
},
{
"epoch": 0.6244477172312224,
"grad_norm": 2.671725358882168,
"learning_rate": 7.781456717501557e-06,
"loss": 0.1887,
"step": 424
},
{
"epoch": 0.625920471281296,
"grad_norm": 3.5658509460222985,
"learning_rate": 7.77183726664713e-06,
"loss": 0.2818,
"step": 425
},
{
"epoch": 0.6273932253313697,
"grad_norm": 3.1709737116220174,
"learning_rate": 7.762202981469358e-06,
"loss": 0.2644,
"step": 426
},
{
"epoch": 0.6288659793814433,
"grad_norm": 2.429853910460586,
"learning_rate": 7.752553913529019e-06,
"loss": 0.17,
"step": 427
},
{
"epoch": 0.6303387334315169,
"grad_norm": 3.1028664137976256,
"learning_rate": 7.74289011446601e-06,
"loss": 0.2063,
"step": 428
},
{
"epoch": 0.6318114874815906,
"grad_norm": 2.71916362952065,
"learning_rate": 7.733211635999072e-06,
"loss": 0.1695,
"step": 429
},
{
"epoch": 0.6332842415316642,
"grad_norm": 2.81679325242666,
"learning_rate": 7.7235185299255e-06,
"loss": 0.1831,
"step": 430
},
{
"epoch": 0.6347569955817378,
"grad_norm": 2.9587096819378056,
"learning_rate": 7.713810848120873e-06,
"loss": 0.189,
"step": 431
},
{
"epoch": 0.6362297496318114,
"grad_norm": 3.259400089947752,
"learning_rate": 7.704088642538782e-06,
"loss": 0.2296,
"step": 432
},
{
"epoch": 0.6377025036818851,
"grad_norm": 2.5365578312254518,
"learning_rate": 7.694351965210543e-06,
"loss": 0.2145,
"step": 433
},
{
"epoch": 0.6391752577319587,
"grad_norm": 2.8276495805494792,
"learning_rate": 7.68460086824492e-06,
"loss": 0.1797,
"step": 434
},
{
"epoch": 0.6406480117820325,
"grad_norm": 2.736677687473085,
"learning_rate": 7.674835403827852e-06,
"loss": 0.2262,
"step": 435
},
{
"epoch": 0.6421207658321061,
"grad_norm": 3.070894838978635,
"learning_rate": 7.665055624222166e-06,
"loss": 0.2135,
"step": 436
},
{
"epoch": 0.6435935198821797,
"grad_norm": 2.816220291246076,
"learning_rate": 7.655261581767306e-06,
"loss": 0.2038,
"step": 437
},
{
"epoch": 0.6450662739322534,
"grad_norm": 2.5905807406641603,
"learning_rate": 7.645453328879042e-06,
"loss": 0.1455,
"step": 438
},
{
"epoch": 0.646539027982327,
"grad_norm": 2.1365012260561858,
"learning_rate": 7.635630918049202e-06,
"loss": 0.1585,
"step": 439
},
{
"epoch": 0.6480117820324006,
"grad_norm": 2.402228516657614,
"learning_rate": 7.625794401845376e-06,
"loss": 0.176,
"step": 440
},
{
"epoch": 0.6494845360824743,
"grad_norm": 2.6298366040596384,
"learning_rate": 7.61594383291065e-06,
"loss": 0.1691,
"step": 441
},
{
"epoch": 0.6509572901325479,
"grad_norm": 2.7053102309257557,
"learning_rate": 7.606079263963318e-06,
"loss": 0.1709,
"step": 442
},
{
"epoch": 0.6524300441826215,
"grad_norm": 2.6618212391915006,
"learning_rate": 7.5962007477965935e-06,
"loss": 0.1815,
"step": 443
},
{
"epoch": 0.6539027982326951,
"grad_norm": 3.0920433094488877,
"learning_rate": 7.5863083372783365e-06,
"loss": 0.208,
"step": 444
},
{
"epoch": 0.6553755522827688,
"grad_norm": 2.621502736300709,
"learning_rate": 7.576402085350765e-06,
"loss": 0.1707,
"step": 445
},
{
"epoch": 0.6568483063328424,
"grad_norm": 2.8975906494183543,
"learning_rate": 7.566482045030179e-06,
"loss": 0.1618,
"step": 446
},
{
"epoch": 0.658321060382916,
"grad_norm": 2.3152886400243133,
"learning_rate": 7.556548269406663e-06,
"loss": 0.1763,
"step": 447
},
{
"epoch": 0.6597938144329897,
"grad_norm": 3.475707187942825,
"learning_rate": 7.546600811643816e-06,
"loss": 0.2268,
"step": 448
},
{
"epoch": 0.6612665684830633,
"grad_norm": 2.5575251767965503,
"learning_rate": 7.536639724978458e-06,
"loss": 0.1678,
"step": 449
},
{
"epoch": 0.6627393225331369,
"grad_norm": 3.1984667154519957,
"learning_rate": 7.526665062720351e-06,
"loss": 0.2474,
"step": 450
},
{
"epoch": 0.6642120765832106,
"grad_norm": 3.4829489314963737,
"learning_rate": 7.516676878251907e-06,
"loss": 0.2525,
"step": 451
},
{
"epoch": 0.6656848306332842,
"grad_norm": 2.597566480163576,
"learning_rate": 7.5066752250279104e-06,
"loss": 0.1661,
"step": 452
},
{
"epoch": 0.6671575846833578,
"grad_norm": 2.6300515588665396,
"learning_rate": 7.4966601565752265e-06,
"loss": 0.1767,
"step": 453
},
{
"epoch": 0.6686303387334315,
"grad_norm": 2.655827217691609,
"learning_rate": 7.486631726492511e-06,
"loss": 0.1936,
"step": 454
},
{
"epoch": 0.6701030927835051,
"grad_norm": 2.9626391882510945,
"learning_rate": 7.476589988449939e-06,
"loss": 0.1721,
"step": 455
},
{
"epoch": 0.6715758468335787,
"grad_norm": 3.098417760868542,
"learning_rate": 7.466534996188897e-06,
"loss": 0.1812,
"step": 456
},
{
"epoch": 0.6730486008836525,
"grad_norm": 2.8057882042803253,
"learning_rate": 7.45646680352171e-06,
"loss": 0.1899,
"step": 457
},
{
"epoch": 0.6745213549337261,
"grad_norm": 2.580716315482495,
"learning_rate": 7.446385464331349e-06,
"loss": 0.1647,
"step": 458
},
{
"epoch": 0.6759941089837997,
"grad_norm": 3.069780114418598,
"learning_rate": 7.436291032571142e-06,
"loss": 0.2297,
"step": 459
},
{
"epoch": 0.6774668630338734,
"grad_norm": 3.1726128225437473,
"learning_rate": 7.426183562264487e-06,
"loss": 0.1997,
"step": 460
},
{
"epoch": 0.678939617083947,
"grad_norm": 3.9757025531128583,
"learning_rate": 7.41606310750456e-06,
"loss": 0.2201,
"step": 461
},
{
"epoch": 0.6804123711340206,
"grad_norm": 2.858930358294962,
"learning_rate": 7.405929722454026e-06,
"loss": 0.1229,
"step": 462
},
{
"epoch": 0.6818851251840943,
"grad_norm": 3.7849179138321336,
"learning_rate": 7.395783461344755e-06,
"loss": 0.1913,
"step": 463
},
{
"epoch": 0.6833578792341679,
"grad_norm": 3.1039636619579865,
"learning_rate": 7.385624378477521e-06,
"loss": 0.193,
"step": 464
},
{
"epoch": 0.6848306332842415,
"grad_norm": 2.5798742938215415,
"learning_rate": 7.375452528221722e-06,
"loss": 0.1894,
"step": 465
},
{
"epoch": 0.6863033873343152,
"grad_norm": 2.903615315853614,
"learning_rate": 7.365267965015086e-06,
"loss": 0.2063,
"step": 466
},
{
"epoch": 0.6877761413843888,
"grad_norm": 3.568623911882026,
"learning_rate": 7.355070743363374e-06,
"loss": 0.2458,
"step": 467
},
{
"epoch": 0.6892488954344624,
"grad_norm": 3.3873971159699297,
"learning_rate": 7.344860917840092e-06,
"loss": 0.1918,
"step": 468
},
{
"epoch": 0.6907216494845361,
"grad_norm": 2.76053873930466,
"learning_rate": 7.334638543086203e-06,
"loss": 0.1664,
"step": 469
},
{
"epoch": 0.6921944035346097,
"grad_norm": 2.912498192150158,
"learning_rate": 7.324403673809831e-06,
"loss": 0.1596,
"step": 470
},
{
"epoch": 0.6936671575846833,
"grad_norm": 2.624045527481493,
"learning_rate": 7.314156364785963e-06,
"loss": 0.1798,
"step": 471
},
{
"epoch": 0.695139911634757,
"grad_norm": 2.859200421062988,
"learning_rate": 7.303896670856168e-06,
"loss": 0.195,
"step": 472
},
{
"epoch": 0.6966126656848306,
"grad_norm": 2.3406721127592562,
"learning_rate": 7.29362464692829e-06,
"loss": 0.1871,
"step": 473
},
{
"epoch": 0.6980854197349042,
"grad_norm": 3.004611054015597,
"learning_rate": 7.283340347976167e-06,
"loss": 0.252,
"step": 474
},
{
"epoch": 0.6995581737849779,
"grad_norm": 2.8720492409213607,
"learning_rate": 7.273043829039325e-06,
"loss": 0.1985,
"step": 475
},
{
"epoch": 0.7010309278350515,
"grad_norm": 2.582611133201151,
"learning_rate": 7.262735145222696e-06,
"loss": 0.1725,
"step": 476
},
{
"epoch": 0.7025036818851251,
"grad_norm": 3.0773343754931703,
"learning_rate": 7.252414351696305e-06,
"loss": 0.1922,
"step": 477
},
{
"epoch": 0.7039764359351989,
"grad_norm": 3.1150044677015964,
"learning_rate": 7.242081503694996e-06,
"loss": 0.2428,
"step": 478
},
{
"epoch": 0.7054491899852725,
"grad_norm": 2.63019918456231,
"learning_rate": 7.2317366565181204e-06,
"loss": 0.1867,
"step": 479
},
{
"epoch": 0.7069219440353461,
"grad_norm": 2.7112622421991457,
"learning_rate": 7.221379865529251e-06,
"loss": 0.1657,
"step": 480
},
{
"epoch": 0.7083946980854198,
"grad_norm": 3.0927886687064126,
"learning_rate": 7.211011186155878e-06,
"loss": 0.2141,
"step": 481
},
{
"epoch": 0.7098674521354934,
"grad_norm": 2.8647429677291547,
"learning_rate": 7.200630673889118e-06,
"loss": 0.2296,
"step": 482
},
{
"epoch": 0.711340206185567,
"grad_norm": 3.086955810955279,
"learning_rate": 7.190238384283413e-06,
"loss": 0.1752,
"step": 483
},
{
"epoch": 0.7128129602356407,
"grad_norm": 2.1480520026160828,
"learning_rate": 7.179834372956236e-06,
"loss": 0.1512,
"step": 484
},
{
"epoch": 0.7142857142857143,
"grad_norm": 2.526700952050052,
"learning_rate": 7.169418695587791e-06,
"loss": 0.185,
"step": 485
},
{
"epoch": 0.7157584683357879,
"grad_norm": 2.669035941510801,
"learning_rate": 7.158991407920721e-06,
"loss": 0.1995,
"step": 486
},
{
"epoch": 0.7172312223858616,
"grad_norm": 2.987668157649501,
"learning_rate": 7.1485525657598e-06,
"loss": 0.2005,
"step": 487
},
{
"epoch": 0.7187039764359352,
"grad_norm": 2.77911904744156,
"learning_rate": 7.13810222497164e-06,
"loss": 0.1804,
"step": 488
},
{
"epoch": 0.7201767304860088,
"grad_norm": 2.952831553303968,
"learning_rate": 7.127640441484393e-06,
"loss": 0.2209,
"step": 489
},
{
"epoch": 0.7216494845360825,
"grad_norm": 2.8357399012012396,
"learning_rate": 7.117167271287453e-06,
"loss": 0.2327,
"step": 490
},
{
"epoch": 0.7231222385861561,
"grad_norm": 2.69688716055132,
"learning_rate": 7.106682770431144e-06,
"loss": 0.1771,
"step": 491
},
{
"epoch": 0.7245949926362297,
"grad_norm": 2.5914437926095886,
"learning_rate": 7.096186995026439e-06,
"loss": 0.1517,
"step": 492
},
{
"epoch": 0.7260677466863034,
"grad_norm": 2.595142151080367,
"learning_rate": 7.085680001244644e-06,
"loss": 0.1847,
"step": 493
},
{
"epoch": 0.727540500736377,
"grad_norm": 2.703732109079409,
"learning_rate": 7.07516184531711e-06,
"loss": 0.2144,
"step": 494
},
{
"epoch": 0.7290132547864506,
"grad_norm": 2.89328239935156,
"learning_rate": 7.064632583534918e-06,
"loss": 0.2031,
"step": 495
},
{
"epoch": 0.7304860088365243,
"grad_norm": 2.641040101207805,
"learning_rate": 7.05409227224859e-06,
"loss": 0.173,
"step": 496
},
{
"epoch": 0.7319587628865979,
"grad_norm": 3.2517271158299246,
"learning_rate": 7.043540967867782e-06,
"loss": 0.1974,
"step": 497
},
{
"epoch": 0.7334315169366715,
"grad_norm": 2.519535779252281,
"learning_rate": 7.032978726860981e-06,
"loss": 0.1787,
"step": 498
},
{
"epoch": 0.7349042709867453,
"grad_norm": 2.864687614972071,
"learning_rate": 7.022405605755209e-06,
"loss": 0.2058,
"step": 499
},
{
"epoch": 0.7363770250368189,
"grad_norm": 2.5944755868817557,
"learning_rate": 7.0118216611357125e-06,
"loss": 0.1306,
"step": 500
},
{
"epoch": 0.7363770250368189,
"eval_loss": 0.2079160511493683,
"eval_runtime": 1.3465,
"eval_samples_per_second": 40.846,
"eval_steps_per_second": 10.397,
"step": 500
},
{
"epoch": 0.7378497790868925,
"grad_norm": 3.177064630921787,
"learning_rate": 7.001226949645663e-06,
"loss": 0.174,
"step": 501
},
{
"epoch": 0.7393225331369662,
"grad_norm": 2.816317102863076,
"learning_rate": 6.990621527985856e-06,
"loss": 0.1435,
"step": 502
},
{
"epoch": 0.7407952871870398,
"grad_norm": 2.792806952818793,
"learning_rate": 6.980005452914404e-06,
"loss": 0.1862,
"step": 503
},
{
"epoch": 0.7422680412371134,
"grad_norm": 2.564807443158852,
"learning_rate": 6.969378781246436e-06,
"loss": 0.1802,
"step": 504
},
{
"epoch": 0.7437407952871871,
"grad_norm": 3.2962420837776545,
"learning_rate": 6.958741569853793e-06,
"loss": 0.249,
"step": 505
},
{
"epoch": 0.7452135493372607,
"grad_norm": 3.2860364528743893,
"learning_rate": 6.948093875664719e-06,
"loss": 0.2022,
"step": 506
},
{
"epoch": 0.7466863033873343,
"grad_norm": 2.810461430677853,
"learning_rate": 6.937435755663561e-06,
"loss": 0.1821,
"step": 507
},
{
"epoch": 0.748159057437408,
"grad_norm": 3.0512682831720994,
"learning_rate": 6.926767266890466e-06,
"loss": 0.2206,
"step": 508
},
{
"epoch": 0.7496318114874816,
"grad_norm": 2.8042132115591323,
"learning_rate": 6.916088466441068e-06,
"loss": 0.1556,
"step": 509
},
{
"epoch": 0.7511045655375552,
"grad_norm": 3.0371364783700803,
"learning_rate": 6.90539941146619e-06,
"loss": 0.1739,
"step": 510
},
{
"epoch": 0.7525773195876289,
"grad_norm": 2.569775234654742,
"learning_rate": 6.894700159171535e-06,
"loss": 0.1689,
"step": 511
},
{
"epoch": 0.7540500736377025,
"grad_norm": 2.933321528237316,
"learning_rate": 6.883990766817378e-06,
"loss": 0.203,
"step": 512
},
{
"epoch": 0.7555228276877761,
"grad_norm": 2.954252427120007,
"learning_rate": 6.8732712917182645e-06,
"loss": 0.1905,
"step": 513
},
{
"epoch": 0.7569955817378498,
"grad_norm": 3.094551675182008,
"learning_rate": 6.862541791242698e-06,
"loss": 0.1982,
"step": 514
},
{
"epoch": 0.7584683357879234,
"grad_norm": 2.589524276912232,
"learning_rate": 6.851802322812839e-06,
"loss": 0.1524,
"step": 515
},
{
"epoch": 0.759941089837997,
"grad_norm": 2.6756348976293958,
"learning_rate": 6.84105294390419e-06,
"loss": 0.1643,
"step": 516
},
{
"epoch": 0.7614138438880707,
"grad_norm": 2.1674746654767927,
"learning_rate": 6.8302937120453e-06,
"loss": 0.1753,
"step": 517
},
{
"epoch": 0.7628865979381443,
"grad_norm": 2.340644234009506,
"learning_rate": 6.819524684817439e-06,
"loss": 0.1601,
"step": 518
},
{
"epoch": 0.7643593519882179,
"grad_norm": 2.327340807374574,
"learning_rate": 6.808745919854307e-06,
"loss": 0.1483,
"step": 519
},
{
"epoch": 0.7658321060382917,
"grad_norm": 3.250904343809451,
"learning_rate": 6.797957474841717e-06,
"loss": 0.215,
"step": 520
},
{
"epoch": 0.7673048600883653,
"grad_norm": 2.5391802727966977,
"learning_rate": 6.787159407517285e-06,
"loss": 0.1823,
"step": 521
},
{
"epoch": 0.7687776141384389,
"grad_norm": 2.7916115575921485,
"learning_rate": 6.776351775670129e-06,
"loss": 0.2174,
"step": 522
},
{
"epoch": 0.7702503681885126,
"grad_norm": 2.7858618390144767,
"learning_rate": 6.765534637140551e-06,
"loss": 0.1957,
"step": 523
},
{
"epoch": 0.7717231222385862,
"grad_norm": 3.5767299189193116,
"learning_rate": 6.754708049819728e-06,
"loss": 0.2414,
"step": 524
},
{
"epoch": 0.7731958762886598,
"grad_norm": 2.858342695786206,
"learning_rate": 6.743872071649411e-06,
"loss": 0.1901,
"step": 525
},
{
"epoch": 0.7746686303387335,
"grad_norm": 2.8252337439834108,
"learning_rate": 6.733026760621607e-06,
"loss": 0.1992,
"step": 526
},
{
"epoch": 0.7761413843888071,
"grad_norm": 2.3041203023856793,
"learning_rate": 6.722172174778267e-06,
"loss": 0.138,
"step": 527
},
{
"epoch": 0.7776141384388807,
"grad_norm": 3.6433507462583883,
"learning_rate": 6.711308372210983e-06,
"loss": 0.281,
"step": 528
},
{
"epoch": 0.7790868924889544,
"grad_norm": 3.826707632819319,
"learning_rate": 6.700435411060674e-06,
"loss": 0.2379,
"step": 529
},
{
"epoch": 0.780559646539028,
"grad_norm": 2.335468844981763,
"learning_rate": 6.689553349517268e-06,
"loss": 0.1426,
"step": 530
},
{
"epoch": 0.7820324005891016,
"grad_norm": 3.1999212336171348,
"learning_rate": 6.678662245819401e-06,
"loss": 0.2075,
"step": 531
},
{
"epoch": 0.7835051546391752,
"grad_norm": 2.670072436287972,
"learning_rate": 6.667762158254104e-06,
"loss": 0.1511,
"step": 532
},
{
"epoch": 0.7849779086892489,
"grad_norm": 3.0405487574452588,
"learning_rate": 6.65685314515648e-06,
"loss": 0.2754,
"step": 533
},
{
"epoch": 0.7864506627393225,
"grad_norm": 2.9059557625759713,
"learning_rate": 6.645935264909404e-06,
"loss": 0.1894,
"step": 534
},
{
"epoch": 0.7879234167893961,
"grad_norm": 3.1020231365165976,
"learning_rate": 6.635008575943208e-06,
"loss": 0.2202,
"step": 535
},
{
"epoch": 0.7893961708394698,
"grad_norm": 2.163415012871819,
"learning_rate": 6.6240731367353624e-06,
"loss": 0.1343,
"step": 536
},
{
"epoch": 0.7908689248895434,
"grad_norm": 3.095956057328875,
"learning_rate": 6.6131290058101696e-06,
"loss": 0.1618,
"step": 537
},
{
"epoch": 0.792341678939617,
"grad_norm": 2.934986128550977,
"learning_rate": 6.602176241738449e-06,
"loss": 0.227,
"step": 538
},
{
"epoch": 0.7938144329896907,
"grad_norm": 2.7997400039218348,
"learning_rate": 6.591214903137221e-06,
"loss": 0.1882,
"step": 539
},
{
"epoch": 0.7952871870397643,
"grad_norm": 3.12439323702351,
"learning_rate": 6.580245048669395e-06,
"loss": 0.1969,
"step": 540
},
{
"epoch": 0.7967599410898379,
"grad_norm": 3.2306163155596415,
"learning_rate": 6.569266737043459e-06,
"loss": 0.2152,
"step": 541
},
{
"epoch": 0.7982326951399117,
"grad_norm": 3.4132440767323424,
"learning_rate": 6.558280027013155e-06,
"loss": 0.2407,
"step": 542
},
{
"epoch": 0.7997054491899853,
"grad_norm": 2.8084491999693264,
"learning_rate": 6.547284977377182e-06,
"loss": 0.1558,
"step": 543
},
{
"epoch": 0.801178203240059,
"grad_norm": 2.4808253484818907,
"learning_rate": 6.536281646978863e-06,
"loss": 0.2054,
"step": 544
},
{
"epoch": 0.8026509572901326,
"grad_norm": 3.3744247435198833,
"learning_rate": 6.525270094705838e-06,
"loss": 0.1971,
"step": 545
},
{
"epoch": 0.8041237113402062,
"grad_norm": 3.127584895400644,
"learning_rate": 6.514250379489754e-06,
"loss": 0.1756,
"step": 546
},
{
"epoch": 0.8055964653902798,
"grad_norm": 2.143280586687572,
"learning_rate": 6.503222560305941e-06,
"loss": 0.1301,
"step": 547
},
{
"epoch": 0.8070692194403535,
"grad_norm": 2.778289265850501,
"learning_rate": 6.492186696173097e-06,
"loss": 0.1746,
"step": 548
},
{
"epoch": 0.8085419734904271,
"grad_norm": 2.9826518884459925,
"learning_rate": 6.481142846152982e-06,
"loss": 0.2291,
"step": 549
},
{
"epoch": 0.8100147275405007,
"grad_norm": 3.6069313230398463,
"learning_rate": 6.47009106935009e-06,
"loss": 0.2182,
"step": 550
},
{
"epoch": 0.8114874815905744,
"grad_norm": 2.9764046649935785,
"learning_rate": 6.45903142491134e-06,
"loss": 0.1821,
"step": 551
},
{
"epoch": 0.812960235640648,
"grad_norm": 2.7725921223575987,
"learning_rate": 6.447963972025752e-06,
"loss": 0.218,
"step": 552
},
{
"epoch": 0.8144329896907216,
"grad_norm": 2.7051072823188362,
"learning_rate": 6.436888769924142e-06,
"loss": 0.159,
"step": 553
},
{
"epoch": 0.8159057437407953,
"grad_norm": 3.0114305097475422,
"learning_rate": 6.425805877878794e-06,
"loss": 0.1807,
"step": 554
},
{
"epoch": 0.8173784977908689,
"grad_norm": 2.506441076459785,
"learning_rate": 6.414715355203149e-06,
"loss": 0.1519,
"step": 555
},
{
"epoch": 0.8188512518409425,
"grad_norm": 2.5973980936466896,
"learning_rate": 6.403617261251485e-06,
"loss": 0.1879,
"step": 556
},
{
"epoch": 0.8203240058910162,
"grad_norm": 2.7762712571831107,
"learning_rate": 6.392511655418599e-06,
"loss": 0.1525,
"step": 557
},
{
"epoch": 0.8217967599410898,
"grad_norm": 2.828187998348971,
"learning_rate": 6.381398597139492e-06,
"loss": 0.2317,
"step": 558
},
{
"epoch": 0.8232695139911634,
"grad_norm": 2.6972269121488073,
"learning_rate": 6.370278145889048e-06,
"loss": 0.186,
"step": 559
},
{
"epoch": 0.8247422680412371,
"grad_norm": 3.542127036239562,
"learning_rate": 6.3591503611817155e-06,
"loss": 0.2108,
"step": 560
},
{
"epoch": 0.8262150220913107,
"grad_norm": 2.8818914546163565,
"learning_rate": 6.348015302571192e-06,
"loss": 0.1946,
"step": 561
},
{
"epoch": 0.8276877761413843,
"grad_norm": 3.2379832338019026,
"learning_rate": 6.336873029650104e-06,
"loss": 0.2005,
"step": 562
},
{
"epoch": 0.8291605301914581,
"grad_norm": 2.834420288726562,
"learning_rate": 6.3257236020496845e-06,
"loss": 0.1567,
"step": 563
},
{
"epoch": 0.8306332842415317,
"grad_norm": 2.8967220378684586,
"learning_rate": 6.3145670794394595e-06,
"loss": 0.221,
"step": 564
},
{
"epoch": 0.8321060382916053,
"grad_norm": 2.4062803602132066,
"learning_rate": 6.303403521526928e-06,
"loss": 0.1897,
"step": 565
},
{
"epoch": 0.833578792341679,
"grad_norm": 3.085110345051428,
"learning_rate": 6.292232988057235e-06,
"loss": 0.2009,
"step": 566
},
{
"epoch": 0.8350515463917526,
"grad_norm": 2.7255177270072988,
"learning_rate": 6.281055538812861e-06,
"loss": 0.1687,
"step": 567
},
{
"epoch": 0.8365243004418262,
"grad_norm": 2.6403632558800387,
"learning_rate": 6.2698712336133e-06,
"loss": 0.1862,
"step": 568
},
{
"epoch": 0.8379970544918999,
"grad_norm": 3.1369325934703514,
"learning_rate": 6.2586801323147314e-06,
"loss": 0.2388,
"step": 569
},
{
"epoch": 0.8394698085419735,
"grad_norm": 2.610228596413145,
"learning_rate": 6.247482294809712e-06,
"loss": 0.172,
"step": 570
},
{
"epoch": 0.8409425625920471,
"grad_norm": 2.728140959550517,
"learning_rate": 6.236277781026849e-06,
"loss": 0.1599,
"step": 571
},
{
"epoch": 0.8424153166421208,
"grad_norm": 3.3397424326917933,
"learning_rate": 6.225066650930476e-06,
"loss": 0.2081,
"step": 572
},
{
"epoch": 0.8438880706921944,
"grad_norm": 2.133794920897429,
"learning_rate": 6.213848964520338e-06,
"loss": 0.1303,
"step": 573
},
{
"epoch": 0.845360824742268,
"grad_norm": 2.8835463225639497,
"learning_rate": 6.202624781831269e-06,
"loss": 0.1654,
"step": 574
},
{
"epoch": 0.8468335787923417,
"grad_norm": 2.367932024564104,
"learning_rate": 6.191394162932867e-06,
"loss": 0.1774,
"step": 575
},
{
"epoch": 0.8483063328424153,
"grad_norm": 2.7740754042726317,
"learning_rate": 6.18015716792918e-06,
"loss": 0.2079,
"step": 576
},
{
"epoch": 0.8497790868924889,
"grad_norm": 2.4191844169668695,
"learning_rate": 6.168913856958374e-06,
"loss": 0.1544,
"step": 577
},
{
"epoch": 0.8512518409425626,
"grad_norm": 3.1026655189630326,
"learning_rate": 6.157664290192421e-06,
"loss": 0.2481,
"step": 578
},
{
"epoch": 0.8527245949926362,
"grad_norm": 2.8896911560329723,
"learning_rate": 6.146408527836771e-06,
"loss": 0.1596,
"step": 579
},
{
"epoch": 0.8541973490427098,
"grad_norm": 2.8646880811489637,
"learning_rate": 6.135146630130033e-06,
"loss": 0.1955,
"step": 580
},
{
"epoch": 0.8556701030927835,
"grad_norm": 3.3949249128943157,
"learning_rate": 6.123878657343648e-06,
"loss": 0.2115,
"step": 581
},
{
"epoch": 0.8571428571428571,
"grad_norm": 2.9508305883194987,
"learning_rate": 6.112604669781572e-06,
"loss": 0.1661,
"step": 582
},
{
"epoch": 0.8586156111929307,
"grad_norm": 3.2778977717650495,
"learning_rate": 6.101324727779954e-06,
"loss": 0.2135,
"step": 583
},
{
"epoch": 0.8600883652430045,
"grad_norm": 2.2927362430201494,
"learning_rate": 6.090038891706801e-06,
"loss": 0.1652,
"step": 584
},
{
"epoch": 0.8615611192930781,
"grad_norm": 3.3248234221138184,
"learning_rate": 6.078747221961675e-06,
"loss": 0.2145,
"step": 585
},
{
"epoch": 0.8630338733431517,
"grad_norm": 3.200462125587272,
"learning_rate": 6.06744977897535e-06,
"loss": 0.1932,
"step": 586
},
{
"epoch": 0.8645066273932254,
"grad_norm": 2.623581099942445,
"learning_rate": 6.0561466232095e-06,
"loss": 0.191,
"step": 587
},
{
"epoch": 0.865979381443299,
"grad_norm": 2.825621335260916,
"learning_rate": 6.044837815156377e-06,
"loss": 0.2154,
"step": 588
},
{
"epoch": 0.8674521354933726,
"grad_norm": 2.8020502133871013,
"learning_rate": 6.033523415338473e-06,
"loss": 0.1903,
"step": 589
},
{
"epoch": 0.8689248895434463,
"grad_norm": 2.858829763733425,
"learning_rate": 6.022203484308217e-06,
"loss": 0.1733,
"step": 590
},
{
"epoch": 0.8703976435935199,
"grad_norm": 2.4940821640154103,
"learning_rate": 6.010878082647631e-06,
"loss": 0.1587,
"step": 591
},
{
"epoch": 0.8718703976435935,
"grad_norm": 2.5652459900784863,
"learning_rate": 5.999547270968024e-06,
"loss": 0.1382,
"step": 592
},
{
"epoch": 0.8733431516936672,
"grad_norm": 2.6485105297629445,
"learning_rate": 5.988211109909647e-06,
"loss": 0.1678,
"step": 593
},
{
"epoch": 0.8748159057437408,
"grad_norm": 2.6150060427654487,
"learning_rate": 5.976869660141389e-06,
"loss": 0.1787,
"step": 594
},
{
"epoch": 0.8762886597938144,
"grad_norm": 2.315123548048991,
"learning_rate": 5.965522982360441e-06,
"loss": 0.1477,
"step": 595
},
{
"epoch": 0.8777614138438881,
"grad_norm": 3.019927103557956,
"learning_rate": 5.954171137291968e-06,
"loss": 0.1771,
"step": 596
},
{
"epoch": 0.8792341678939617,
"grad_norm": 2.608181929581868,
"learning_rate": 5.942814185688799e-06,
"loss": 0.1553,
"step": 597
},
{
"epoch": 0.8807069219440353,
"grad_norm": 2.8420856678687567,
"learning_rate": 5.931452188331084e-06,
"loss": 0.1862,
"step": 598
},
{
"epoch": 0.882179675994109,
"grad_norm": 3.3885149091071187,
"learning_rate": 5.920085206025979e-06,
"loss": 0.1709,
"step": 599
},
{
"epoch": 0.8836524300441826,
"grad_norm": 2.9530752607122714,
"learning_rate": 5.908713299607318e-06,
"loss": 0.1717,
"step": 600
},
{
"epoch": 0.8851251840942562,
"grad_norm": 3.3443040794874657,
"learning_rate": 5.897336529935292e-06,
"loss": 0.2336,
"step": 601
},
{
"epoch": 0.8865979381443299,
"grad_norm": 2.803553171269139,
"learning_rate": 5.885954957896115e-06,
"loss": 0.1811,
"step": 602
},
{
"epoch": 0.8880706921944035,
"grad_norm": 2.462103288691291,
"learning_rate": 5.874568644401702e-06,
"loss": 0.1657,
"step": 603
},
{
"epoch": 0.8895434462444771,
"grad_norm": 2.97521020574475,
"learning_rate": 5.863177650389346e-06,
"loss": 0.1821,
"step": 604
},
{
"epoch": 0.8910162002945509,
"grad_norm": 2.3362815268370145,
"learning_rate": 5.851782036821387e-06,
"loss": 0.1177,
"step": 605
},
{
"epoch": 0.8924889543446245,
"grad_norm": 2.697364125595746,
"learning_rate": 5.840381864684892e-06,
"loss": 0.2245,
"step": 606
},
{
"epoch": 0.8939617083946981,
"grad_norm": 3.1676386145436304,
"learning_rate": 5.828977194991318e-06,
"loss": 0.258,
"step": 607
},
{
"epoch": 0.8954344624447718,
"grad_norm": 2.9439386755156307,
"learning_rate": 5.817568088776195e-06,
"loss": 0.198,
"step": 608
},
{
"epoch": 0.8969072164948454,
"grad_norm": 3.116267212976988,
"learning_rate": 5.806154607098799e-06,
"loss": 0.2417,
"step": 609
},
{
"epoch": 0.898379970544919,
"grad_norm": 3.0874912391485805,
"learning_rate": 5.794736811041821e-06,
"loss": 0.1924,
"step": 610
},
{
"epoch": 0.8998527245949927,
"grad_norm": 3.121816238099267,
"learning_rate": 5.783314761711038e-06,
"loss": 0.213,
"step": 611
},
{
"epoch": 0.9013254786450663,
"grad_norm": 2.259615605065723,
"learning_rate": 5.771888520234997e-06,
"loss": 0.1352,
"step": 612
},
{
"epoch": 0.9027982326951399,
"grad_norm": 2.880010684013978,
"learning_rate": 5.760458147764673e-06,
"loss": 0.161,
"step": 613
},
{
"epoch": 0.9042709867452136,
"grad_norm": 2.281221094404714,
"learning_rate": 5.749023705473154e-06,
"loss": 0.1812,
"step": 614
},
{
"epoch": 0.9057437407952872,
"grad_norm": 3.7696229509532238,
"learning_rate": 5.737585254555307e-06,
"loss": 0.1914,
"step": 615
},
{
"epoch": 0.9072164948453608,
"grad_norm": 2.946395578493942,
"learning_rate": 5.726142856227453e-06,
"loss": 0.1815,
"step": 616
},
{
"epoch": 0.9086892488954345,
"grad_norm": 2.9293787170665095,
"learning_rate": 5.714696571727037e-06,
"loss": 0.1769,
"step": 617
},
{
"epoch": 0.9101620029455081,
"grad_norm": 2.3808186014335573,
"learning_rate": 5.703246462312307e-06,
"loss": 0.1362,
"step": 618
},
{
"epoch": 0.9116347569955817,
"grad_norm": 2.956481152311808,
"learning_rate": 5.6917925892619775e-06,
"loss": 0.1745,
"step": 619
},
{
"epoch": 0.9131075110456554,
"grad_norm": 3.4406449283229814,
"learning_rate": 5.680335013874903e-06,
"loss": 0.257,
"step": 620
},
{
"epoch": 0.914580265095729,
"grad_norm": 3.079800184193429,
"learning_rate": 5.668873797469756e-06,
"loss": 0.167,
"step": 621
},
{
"epoch": 0.9160530191458026,
"grad_norm": 3.253296370089598,
"learning_rate": 5.657409001384695e-06,
"loss": 0.1757,
"step": 622
},
{
"epoch": 0.9175257731958762,
"grad_norm": 2.156710713498128,
"learning_rate": 5.645940686977033e-06,
"loss": 0.1115,
"step": 623
},
{
"epoch": 0.9189985272459499,
"grad_norm": 2.6144980122779025,
"learning_rate": 5.634468915622915e-06,
"loss": 0.1427,
"step": 624
},
{
"epoch": 0.9204712812960235,
"grad_norm": 2.790011108638482,
"learning_rate": 5.622993748716987e-06,
"loss": 0.1843,
"step": 625
},
{
"epoch": 0.9219440353460973,
"grad_norm": 2.48624098429388,
"learning_rate": 5.611515247672063e-06,
"loss": 0.1486,
"step": 626
},
{
"epoch": 0.9234167893961709,
"grad_norm": 2.91750565490176,
"learning_rate": 5.600033473918811e-06,
"loss": 0.1966,
"step": 627
},
{
"epoch": 0.9248895434462445,
"grad_norm": 2.8892768676541767,
"learning_rate": 5.588548488905402e-06,
"loss": 0.1658,
"step": 628
},
{
"epoch": 0.9263622974963182,
"grad_norm": 2.6350547691066986,
"learning_rate": 5.577060354097199e-06,
"loss": 0.2129,
"step": 629
},
{
"epoch": 0.9278350515463918,
"grad_norm": 3.1772750636856455,
"learning_rate": 5.5655691309764225e-06,
"loss": 0.166,
"step": 630
},
{
"epoch": 0.9293078055964654,
"grad_norm": 2.975003521974751,
"learning_rate": 5.554074881041818e-06,
"loss": 0.1699,
"step": 631
},
{
"epoch": 0.930780559646539,
"grad_norm": 2.3653644886736513,
"learning_rate": 5.542577665808332e-06,
"loss": 0.1725,
"step": 632
},
{
"epoch": 0.9322533136966127,
"grad_norm": 3.221238075347828,
"learning_rate": 5.531077546806783e-06,
"loss": 0.2137,
"step": 633
},
{
"epoch": 0.9337260677466863,
"grad_norm": 2.8452909275134806,
"learning_rate": 5.519574585583523e-06,
"loss": 0.1879,
"step": 634
},
{
"epoch": 0.93519882179676,
"grad_norm": 2.9331185445999535,
"learning_rate": 5.508068843700121e-06,
"loss": 0.2018,
"step": 635
},
{
"epoch": 0.9366715758468336,
"grad_norm": 2.5423046775907276,
"learning_rate": 5.496560382733028e-06,
"loss": 0.1654,
"step": 636
},
{
"epoch": 0.9381443298969072,
"grad_norm": 2.886759957099059,
"learning_rate": 5.485049264273241e-06,
"loss": 0.198,
"step": 637
},
{
"epoch": 0.9396170839469808,
"grad_norm": 2.4925402691142517,
"learning_rate": 5.473535549925986e-06,
"loss": 0.1308,
"step": 638
},
{
"epoch": 0.9410898379970545,
"grad_norm": 3.041109792137434,
"learning_rate": 5.462019301310378e-06,
"loss": 0.1956,
"step": 639
},
{
"epoch": 0.9425625920471281,
"grad_norm": 3.1775307076021333,
"learning_rate": 5.450500580059095e-06,
"loss": 0.2047,
"step": 640
},
{
"epoch": 0.9440353460972017,
"grad_norm": 2.6506780197656865,
"learning_rate": 5.438979447818049e-06,
"loss": 0.1625,
"step": 641
},
{
"epoch": 0.9455081001472754,
"grad_norm": 3.1124656946929234,
"learning_rate": 5.427455966246057e-06,
"loss": 0.1784,
"step": 642
},
{
"epoch": 0.946980854197349,
"grad_norm": 2.4473149482228402,
"learning_rate": 5.415930197014503e-06,
"loss": 0.1813,
"step": 643
},
{
"epoch": 0.9484536082474226,
"grad_norm": 2.678785367089142,
"learning_rate": 5.404402201807022e-06,
"loss": 0.174,
"step": 644
},
{
"epoch": 0.9499263622974963,
"grad_norm": 2.2719072811027012,
"learning_rate": 5.392872042319155e-06,
"loss": 0.1311,
"step": 645
},
{
"epoch": 0.9513991163475699,
"grad_norm": 3.150733238587214,
"learning_rate": 5.381339780258034e-06,
"loss": 0.2442,
"step": 646
},
{
"epoch": 0.9528718703976435,
"grad_norm": 3.5455424017999166,
"learning_rate": 5.369805477342032e-06,
"loss": 0.2337,
"step": 647
},
{
"epoch": 0.9543446244477173,
"grad_norm": 2.7482737086217135,
"learning_rate": 5.358269195300454e-06,
"loss": 0.2175,
"step": 648
},
{
"epoch": 0.9558173784977909,
"grad_norm": 2.776798925728169,
"learning_rate": 5.346730995873194e-06,
"loss": 0.1611,
"step": 649
},
{
"epoch": 0.9572901325478645,
"grad_norm": 2.905049948649321,
"learning_rate": 5.335190940810407e-06,
"loss": 0.1973,
"step": 650
},
{
"epoch": 0.9587628865979382,
"grad_norm": 2.9860800230634283,
"learning_rate": 5.323649091872179e-06,
"loss": 0.2157,
"step": 651
},
{
"epoch": 0.9602356406480118,
"grad_norm": 3.056382593251139,
"learning_rate": 5.312105510828196e-06,
"loss": 0.1854,
"step": 652
},
{
"epoch": 0.9617083946980854,
"grad_norm": 1.8633584294604082,
"learning_rate": 5.300560259457414e-06,
"loss": 0.1139,
"step": 653
},
{
"epoch": 0.9631811487481591,
"grad_norm": 2.8115313634091956,
"learning_rate": 5.289013399547732e-06,
"loss": 0.1708,
"step": 654
},
{
"epoch": 0.9646539027982327,
"grad_norm": 2.8530294541287256,
"learning_rate": 5.27746499289565e-06,
"loss": 0.1731,
"step": 655
},
{
"epoch": 0.9661266568483063,
"grad_norm": 2.2259269099856542,
"learning_rate": 5.265915101305952e-06,
"loss": 0.1111,
"step": 656
},
{
"epoch": 0.96759941089838,
"grad_norm": 3.061910370458147,
"learning_rate": 5.254363786591368e-06,
"loss": 0.2037,
"step": 657
},
{
"epoch": 0.9690721649484536,
"grad_norm": 2.6756366507310623,
"learning_rate": 5.242811110572243e-06,
"loss": 0.2021,
"step": 658
},
{
"epoch": 0.9705449189985272,
"grad_norm": 3.1881851785691486,
"learning_rate": 5.231257135076205e-06,
"loss": 0.1759,
"step": 659
},
{
"epoch": 0.9720176730486009,
"grad_norm": 2.6359163663678964,
"learning_rate": 5.219701921937845e-06,
"loss": 0.1518,
"step": 660
},
{
"epoch": 0.9734904270986745,
"grad_norm": 2.51301787921435,
"learning_rate": 5.208145532998369e-06,
"loss": 0.1483,
"step": 661
},
{
"epoch": 0.9749631811487481,
"grad_norm": 3.025821223156431,
"learning_rate": 5.196588030105278e-06,
"loss": 0.1493,
"step": 662
},
{
"epoch": 0.9764359351988218,
"grad_norm": 3.185702920503757,
"learning_rate": 5.185029475112038e-06,
"loss": 0.2023,
"step": 663
},
{
"epoch": 0.9779086892488954,
"grad_norm": 2.096280595044242,
"learning_rate": 5.173469929877741e-06,
"loss": 0.149,
"step": 664
},
{
"epoch": 0.979381443298969,
"grad_norm": 2.5060157239018617,
"learning_rate": 5.161909456266781e-06,
"loss": 0.1249,
"step": 665
},
{
"epoch": 0.9808541973490427,
"grad_norm": 3.2185960894603602,
"learning_rate": 5.1503481161485206e-06,
"loss": 0.2196,
"step": 666
},
{
"epoch": 0.9823269513991163,
"grad_norm": 2.6663010480518428,
"learning_rate": 5.138785971396959e-06,
"loss": 0.2039,
"step": 667
},
{
"epoch": 0.9837997054491899,
"grad_norm": 2.8699961471842386,
"learning_rate": 5.127223083890402e-06,
"loss": 0.1581,
"step": 668
},
{
"epoch": 0.9852724594992637,
"grad_norm": 2.7556015653714634,
"learning_rate": 5.11565951551113e-06,
"loss": 0.1703,
"step": 669
},
{
"epoch": 0.9867452135493373,
"grad_norm": 3.5065573085381447,
"learning_rate": 5.104095328145069e-06,
"loss": 0.2098,
"step": 670
},
{
"epoch": 0.9882179675994109,
"grad_norm": 2.6616463680002824,
"learning_rate": 5.0925305836814546e-06,
"loss": 0.1487,
"step": 671
},
{
"epoch": 0.9896907216494846,
"grad_norm": 2.8133418788027753,
"learning_rate": 5.080965344012509e-06,
"loss": 0.1335,
"step": 672
},
{
"epoch": 0.9911634756995582,
"grad_norm": 2.9595738356622103,
"learning_rate": 5.069399671033096e-06,
"loss": 0.1642,
"step": 673
},
{
"epoch": 0.9926362297496318,
"grad_norm": 2.6728211673881073,
"learning_rate": 5.0578336266404085e-06,
"loss": 0.1389,
"step": 674
},
{
"epoch": 0.9941089837997055,
"grad_norm": 3.080439476388202,
"learning_rate": 5.046267272733621e-06,
"loss": 0.1965,
"step": 675
},
{
"epoch": 0.9955817378497791,
"grad_norm": 2.4701410992374524,
"learning_rate": 5.034700671213565e-06,
"loss": 0.1405,
"step": 676
},
{
"epoch": 0.9970544918998527,
"grad_norm": 2.84391651358579,
"learning_rate": 5.023133883982398e-06,
"loss": 0.1691,
"step": 677
},
{
"epoch": 0.9985272459499264,
"grad_norm": 2.8974978669512153,
"learning_rate": 5.0115669729432725e-06,
"loss": 0.1626,
"step": 678
},
{
"epoch": 1.0,
"grad_norm": 2.6071750843239125,
"learning_rate": 5e-06,
"loss": 0.171,
"step": 679
},
{
"epoch": 1.0014727540500736,
"grad_norm": 2.0908722198043987,
"learning_rate": 4.988433027056729e-06,
"loss": 0.0877,
"step": 680
},
{
"epoch": 1.0029455081001473,
"grad_norm": 2.3075032162377798,
"learning_rate": 4.976866116017604e-06,
"loss": 0.0878,
"step": 681
},
{
"epoch": 1.004418262150221,
"grad_norm": 1.8339841498407248,
"learning_rate": 4.965299328786437e-06,
"loss": 0.0837,
"step": 682
},
{
"epoch": 1.0058910162002945,
"grad_norm": 1.6006550769733303,
"learning_rate": 4.95373272726638e-06,
"loss": 0.08,
"step": 683
},
{
"epoch": 1.0073637702503682,
"grad_norm": 2.3923089526567223,
"learning_rate": 4.942166373359593e-06,
"loss": 0.0954,
"step": 684
},
{
"epoch": 1.0088365243004418,
"grad_norm": 1.8073001932691344,
"learning_rate": 4.930600328966904e-06,
"loss": 0.0534,
"step": 685
},
{
"epoch": 1.0103092783505154,
"grad_norm": 2.112034239924799,
"learning_rate": 4.919034655987493e-06,
"loss": 0.0712,
"step": 686
},
{
"epoch": 1.011782032400589,
"grad_norm": 1.9019850417584385,
"learning_rate": 4.907469416318547e-06,
"loss": 0.072,
"step": 687
},
{
"epoch": 1.0132547864506627,
"grad_norm": 2.0877136424715577,
"learning_rate": 4.895904671854933e-06,
"loss": 0.0771,
"step": 688
},
{
"epoch": 1.0147275405007363,
"grad_norm": 2.077572121775792,
"learning_rate": 4.884340484488872e-06,
"loss": 0.0713,
"step": 689
},
{
"epoch": 1.01620029455081,
"grad_norm": 2.7869591073328075,
"learning_rate": 4.872776916109601e-06,
"loss": 0.1134,
"step": 690
},
{
"epoch": 1.0176730486008836,
"grad_norm": 1.8746902344963947,
"learning_rate": 4.861214028603044e-06,
"loss": 0.0571,
"step": 691
},
{
"epoch": 1.0191458026509572,
"grad_norm": 2.6069603988935297,
"learning_rate": 4.849651883851482e-06,
"loss": 0.0868,
"step": 692
},
{
"epoch": 1.0206185567010309,
"grad_norm": 2.740480215432774,
"learning_rate": 4.838090543733222e-06,
"loss": 0.0948,
"step": 693
},
{
"epoch": 1.0220913107511045,
"grad_norm": 3.8793953206281833,
"learning_rate": 4.826530070122262e-06,
"loss": 0.07,
"step": 694
},
{
"epoch": 1.0235640648011781,
"grad_norm": 2.775139749937407,
"learning_rate": 4.814970524887965e-06,
"loss": 0.0927,
"step": 695
},
{
"epoch": 1.0250368188512518,
"grad_norm": 3.085066313685762,
"learning_rate": 4.8034119698947244e-06,
"loss": 0.094,
"step": 696
},
{
"epoch": 1.0265095729013254,
"grad_norm": 3.281643571176706,
"learning_rate": 4.791854467001634e-06,
"loss": 0.0658,
"step": 697
},
{
"epoch": 1.027982326951399,
"grad_norm": 2.4078985119085634,
"learning_rate": 4.780298078062157e-06,
"loss": 0.0557,
"step": 698
},
{
"epoch": 1.0294550810014726,
"grad_norm": 2.7060207713139923,
"learning_rate": 4.768742864923797e-06,
"loss": 0.048,
"step": 699
},
{
"epoch": 1.0309278350515463,
"grad_norm": 2.856556717889814,
"learning_rate": 4.757188889427761e-06,
"loss": 0.06,
"step": 700
},
{
"epoch": 1.0324005891016201,
"grad_norm": 3.159586197772598,
"learning_rate": 4.745636213408633e-06,
"loss": 0.0682,
"step": 701
},
{
"epoch": 1.0338733431516938,
"grad_norm": 2.932631552062124,
"learning_rate": 4.734084898694049e-06,
"loss": 0.0803,
"step": 702
},
{
"epoch": 1.0353460972017674,
"grad_norm": 3.076181585290304,
"learning_rate": 4.72253500710435e-06,
"loss": 0.0675,
"step": 703
},
{
"epoch": 1.036818851251841,
"grad_norm": 2.8918203833899003,
"learning_rate": 4.710986600452269e-06,
"loss": 0.0615,
"step": 704
},
{
"epoch": 1.0382916053019147,
"grad_norm": 2.4534677540942407,
"learning_rate": 4.699439740542586e-06,
"loss": 0.0611,
"step": 705
},
{
"epoch": 1.0397643593519883,
"grad_norm": 3.277146390134441,
"learning_rate": 4.687894489171804e-06,
"loss": 0.0934,
"step": 706
},
{
"epoch": 1.041237113402062,
"grad_norm": 2.9290800449770926,
"learning_rate": 4.6763509081278215e-06,
"loss": 0.0877,
"step": 707
},
{
"epoch": 1.0427098674521356,
"grad_norm": 2.451559748967979,
"learning_rate": 4.664809059189594e-06,
"loss": 0.0655,
"step": 708
},
{
"epoch": 1.0441826215022092,
"grad_norm": 3.0014355915899382,
"learning_rate": 4.653269004126806e-06,
"loss": 0.0984,
"step": 709
},
{
"epoch": 1.0456553755522828,
"grad_norm": 2.9636564956506803,
"learning_rate": 4.641730804699547e-06,
"loss": 0.0893,
"step": 710
},
{
"epoch": 1.0471281296023565,
"grad_norm": 2.668332309680153,
"learning_rate": 4.63019452265797e-06,
"loss": 0.0704,
"step": 711
},
{
"epoch": 1.04860088365243,
"grad_norm": 2.025496591492349,
"learning_rate": 4.618660219741968e-06,
"loss": 0.049,
"step": 712
},
{
"epoch": 1.0500736377025037,
"grad_norm": 2.6980749570347387,
"learning_rate": 4.607127957680846e-06,
"loss": 0.0677,
"step": 713
},
{
"epoch": 1.0515463917525774,
"grad_norm": 3.329232545155469,
"learning_rate": 4.59559779819298e-06,
"loss": 0.0861,
"step": 714
},
{
"epoch": 1.053019145802651,
"grad_norm": 2.8359873347772635,
"learning_rate": 4.584069802985498e-06,
"loss": 0.0755,
"step": 715
},
{
"epoch": 1.0544918998527246,
"grad_norm": 2.274924239132417,
"learning_rate": 4.572544033753945e-06,
"loss": 0.063,
"step": 716
},
{
"epoch": 1.0559646539027983,
"grad_norm": 2.854977227907341,
"learning_rate": 4.561020552181952e-06,
"loss": 0.1293,
"step": 717
},
{
"epoch": 1.0574374079528719,
"grad_norm": 2.8523771548633734,
"learning_rate": 4.549499419940906e-06,
"loss": 0.0763,
"step": 718
},
{
"epoch": 1.0589101620029455,
"grad_norm": 3.1579164863004436,
"learning_rate": 4.537980698689623e-06,
"loss": 0.0792,
"step": 719
},
{
"epoch": 1.0603829160530192,
"grad_norm": 2.087311962701497,
"learning_rate": 4.526464450074016e-06,
"loss": 0.054,
"step": 720
},
{
"epoch": 1.0618556701030928,
"grad_norm": 2.7362138039011126,
"learning_rate": 4.51495073572676e-06,
"loss": 0.0715,
"step": 721
},
{
"epoch": 1.0633284241531664,
"grad_norm": 2.37470137270783,
"learning_rate": 4.503439617266974e-06,
"loss": 0.0563,
"step": 722
},
{
"epoch": 1.06480117820324,
"grad_norm": 3.2907467519259384,
"learning_rate": 4.49193115629988e-06,
"loss": 0.095,
"step": 723
},
{
"epoch": 1.0662739322533137,
"grad_norm": 2.4580532091105622,
"learning_rate": 4.480425414416479e-06,
"loss": 0.072,
"step": 724
},
{
"epoch": 1.0677466863033873,
"grad_norm": 2.430633005701922,
"learning_rate": 4.468922453193219e-06,
"loss": 0.0701,
"step": 725
},
{
"epoch": 1.069219440353461,
"grad_norm": 2.6208550727093485,
"learning_rate": 4.45742233419167e-06,
"loss": 0.0714,
"step": 726
},
{
"epoch": 1.0706921944035346,
"grad_norm": 2.1317122254363667,
"learning_rate": 4.445925118958184e-06,
"loss": 0.0713,
"step": 727
},
{
"epoch": 1.0721649484536082,
"grad_norm": 1.8261168040168063,
"learning_rate": 4.434430869023579e-06,
"loss": 0.0565,
"step": 728
},
{
"epoch": 1.0736377025036818,
"grad_norm": 2.7129508356879652,
"learning_rate": 4.422939645902803e-06,
"loss": 0.087,
"step": 729
},
{
"epoch": 1.0751104565537555,
"grad_norm": 2.4057274548479572,
"learning_rate": 4.4114515110946e-06,
"loss": 0.0655,
"step": 730
},
{
"epoch": 1.076583210603829,
"grad_norm": 2.03399280266938,
"learning_rate": 4.39996652608119e-06,
"loss": 0.0666,
"step": 731
},
{
"epoch": 1.0780559646539027,
"grad_norm": 2.596095403893847,
"learning_rate": 4.3884847523279374e-06,
"loss": 0.0826,
"step": 732
},
{
"epoch": 1.0795287187039764,
"grad_norm": 2.171242877696304,
"learning_rate": 4.377006251283015e-06,
"loss": 0.073,
"step": 733
},
{
"epoch": 1.08100147275405,
"grad_norm": 2.806803218409549,
"learning_rate": 4.365531084377087e-06,
"loss": 0.0785,
"step": 734
},
{
"epoch": 1.0824742268041236,
"grad_norm": 2.62258755991761,
"learning_rate": 4.3540593130229695e-06,
"loss": 0.067,
"step": 735
},
{
"epoch": 1.0839469808541973,
"grad_norm": 2.7392273222794046,
"learning_rate": 4.342590998615308e-06,
"loss": 0.0798,
"step": 736
},
{
"epoch": 1.085419734904271,
"grad_norm": 3.2294373177028115,
"learning_rate": 4.331126202530245e-06,
"loss": 0.0825,
"step": 737
},
{
"epoch": 1.0868924889543445,
"grad_norm": 2.1665291430967253,
"learning_rate": 4.319664986125099e-06,
"loss": 0.0774,
"step": 738
},
{
"epoch": 1.0883652430044182,
"grad_norm": 2.686454697277816,
"learning_rate": 4.308207410738024e-06,
"loss": 0.0837,
"step": 739
},
{
"epoch": 1.0898379970544918,
"grad_norm": 2.7185930478468348,
"learning_rate": 4.296753537687694e-06,
"loss": 0.0688,
"step": 740
},
{
"epoch": 1.0913107511045654,
"grad_norm": 2.154998878381218,
"learning_rate": 4.2853034282729644e-06,
"loss": 0.0488,
"step": 741
},
{
"epoch": 1.0927835051546393,
"grad_norm": 2.476327579315816,
"learning_rate": 4.27385714377255e-06,
"loss": 0.0715,
"step": 742
},
{
"epoch": 1.094256259204713,
"grad_norm": 2.475778910547172,
"learning_rate": 4.2624147454446945e-06,
"loss": 0.0697,
"step": 743
},
{
"epoch": 1.0957290132547866,
"grad_norm": 2.798899972530247,
"learning_rate": 4.250976294526847e-06,
"loss": 0.074,
"step": 744
},
{
"epoch": 1.0972017673048602,
"grad_norm": 2.777019451392523,
"learning_rate": 4.239541852235327e-06,
"loss": 0.0681,
"step": 745
},
{
"epoch": 1.0986745213549338,
"grad_norm": 2.174088157331811,
"learning_rate": 4.228111479765004e-06,
"loss": 0.0672,
"step": 746
},
{
"epoch": 1.1001472754050075,
"grad_norm": 2.7695424502767283,
"learning_rate": 4.216685238288962e-06,
"loss": 0.0855,
"step": 747
},
{
"epoch": 1.101620029455081,
"grad_norm": 2.99079466026466,
"learning_rate": 4.20526318895818e-06,
"loss": 0.0961,
"step": 748
},
{
"epoch": 1.1030927835051547,
"grad_norm": 2.7732745665790275,
"learning_rate": 4.1938453929012014e-06,
"loss": 0.0772,
"step": 749
},
{
"epoch": 1.1045655375552283,
"grad_norm": 1.9954883422550331,
"learning_rate": 4.182431911223806e-06,
"loss": 0.0707,
"step": 750
},
{
"epoch": 1.106038291605302,
"grad_norm": 2.3261727870576516,
"learning_rate": 4.171022805008683e-06,
"loss": 0.0725,
"step": 751
},
{
"epoch": 1.1075110456553756,
"grad_norm": 2.3142670589612306,
"learning_rate": 4.159618135315109e-06,
"loss": 0.0755,
"step": 752
},
{
"epoch": 1.1089837997054492,
"grad_norm": 2.7210466709496894,
"learning_rate": 4.1482179631786126e-06,
"loss": 0.0751,
"step": 753
},
{
"epoch": 1.1104565537555229,
"grad_norm": 2.8409582892005694,
"learning_rate": 4.1368223496106544e-06,
"loss": 0.0982,
"step": 754
},
{
"epoch": 1.1119293078055965,
"grad_norm": 2.5561913135130014,
"learning_rate": 4.125431355598299e-06,
"loss": 0.0942,
"step": 755
},
{
"epoch": 1.1134020618556701,
"grad_norm": 3.1615551599035103,
"learning_rate": 4.1140450421038865e-06,
"loss": 0.1051,
"step": 756
},
{
"epoch": 1.1148748159057438,
"grad_norm": 1.8911758166555228,
"learning_rate": 4.102663470064709e-06,
"loss": 0.0598,
"step": 757
},
{
"epoch": 1.1163475699558174,
"grad_norm": 2.6027868423883183,
"learning_rate": 4.091286700392683e-06,
"loss": 0.08,
"step": 758
},
{
"epoch": 1.117820324005891,
"grad_norm": 3.3172593190538486,
"learning_rate": 4.079914793974024e-06,
"loss": 0.0932,
"step": 759
},
{
"epoch": 1.1192930780559647,
"grad_norm": 2.584014398968938,
"learning_rate": 4.068547811668918e-06,
"loss": 0.0737,
"step": 760
},
{
"epoch": 1.1207658321060383,
"grad_norm": 2.711235814730717,
"learning_rate": 4.057185814311203e-06,
"loss": 0.0761,
"step": 761
},
{
"epoch": 1.122238586156112,
"grad_norm": 3.013495034597246,
"learning_rate": 4.0458288627080325e-06,
"loss": 0.1001,
"step": 762
},
{
"epoch": 1.1237113402061856,
"grad_norm": 2.9214594992749325,
"learning_rate": 4.034477017639561e-06,
"loss": 0.0937,
"step": 763
},
{
"epoch": 1.1251840942562592,
"grad_norm": 2.570753452463597,
"learning_rate": 4.0231303398586124e-06,
"loss": 0.0892,
"step": 764
},
{
"epoch": 1.1266568483063328,
"grad_norm": 2.2587922331126657,
"learning_rate": 4.011788890090354e-06,
"loss": 0.0819,
"step": 765
},
{
"epoch": 1.1281296023564065,
"grad_norm": 2.850862254910788,
"learning_rate": 4.000452729031978e-06,
"loss": 0.0746,
"step": 766
},
{
"epoch": 1.12960235640648,
"grad_norm": 2.070636849791486,
"learning_rate": 3.98912191735237e-06,
"loss": 0.0724,
"step": 767
},
{
"epoch": 1.1310751104565537,
"grad_norm": 2.60117941900419,
"learning_rate": 3.977796515691785e-06,
"loss": 0.0748,
"step": 768
},
{
"epoch": 1.1325478645066274,
"grad_norm": 2.1648271561118144,
"learning_rate": 3.966476584661528e-06,
"loss": 0.0625,
"step": 769
},
{
"epoch": 1.134020618556701,
"grad_norm": 3.459878540617657,
"learning_rate": 3.955162184843625e-06,
"loss": 0.0995,
"step": 770
},
{
"epoch": 1.1354933726067746,
"grad_norm": 2.0253389265031645,
"learning_rate": 3.943853376790501e-06,
"loss": 0.0515,
"step": 771
},
{
"epoch": 1.1369661266568483,
"grad_norm": 2.3346165427095724,
"learning_rate": 3.932550221024651e-06,
"loss": 0.0944,
"step": 772
},
{
"epoch": 1.138438880706922,
"grad_norm": 2.4437185103047527,
"learning_rate": 3.921252778038326e-06,
"loss": 0.0784,
"step": 773
},
{
"epoch": 1.1399116347569955,
"grad_norm": 2.2346000371288697,
"learning_rate": 3.9099611082932e-06,
"loss": 0.0778,
"step": 774
},
{
"epoch": 1.1413843888070692,
"grad_norm": 3.126462530347664,
"learning_rate": 3.898675272220048e-06,
"loss": 0.0901,
"step": 775
},
{
"epoch": 1.1428571428571428,
"grad_norm": 2.6212182691393417,
"learning_rate": 3.887395330218429e-06,
"loss": 0.067,
"step": 776
},
{
"epoch": 1.1443298969072164,
"grad_norm": 2.2041859965816912,
"learning_rate": 3.8761213426563546e-06,
"loss": 0.0718,
"step": 777
},
{
"epoch": 1.14580265095729,
"grad_norm": 2.8686011497863637,
"learning_rate": 3.8648533698699695e-06,
"loss": 0.0739,
"step": 778
},
{
"epoch": 1.1472754050073637,
"grad_norm": 3.570906083127591,
"learning_rate": 3.85359147216323e-06,
"loss": 0.0645,
"step": 779
},
{
"epoch": 1.1487481590574373,
"grad_norm": 2.3026385044388946,
"learning_rate": 3.842335709807582e-06,
"loss": 0.07,
"step": 780
},
{
"epoch": 1.150220913107511,
"grad_norm": 2.3324933414823037,
"learning_rate": 3.831086143041628e-06,
"loss": 0.0701,
"step": 781
},
{
"epoch": 1.1516936671575846,
"grad_norm": 2.870788539209186,
"learning_rate": 3.819842832070822e-06,
"loss": 0.0836,
"step": 782
},
{
"epoch": 1.1531664212076582,
"grad_norm": 1.929302265810244,
"learning_rate": 3.808605837067135e-06,
"loss": 0.0697,
"step": 783
},
{
"epoch": 1.1546391752577319,
"grad_norm": 2.136539690495229,
"learning_rate": 3.7973752181687336e-06,
"loss": 0.0644,
"step": 784
},
{
"epoch": 1.1561119293078055,
"grad_norm": 2.2728146164935836,
"learning_rate": 3.786151035479664e-06,
"loss": 0.0834,
"step": 785
},
{
"epoch": 1.1575846833578791,
"grad_norm": 2.3642399727192682,
"learning_rate": 3.774933349069524e-06,
"loss": 0.0648,
"step": 786
},
{
"epoch": 1.1590574374079528,
"grad_norm": 2.6823533671482678,
"learning_rate": 3.76372221897315e-06,
"loss": 0.0951,
"step": 787
},
{
"epoch": 1.1605301914580266,
"grad_norm": 2.2935431804364366,
"learning_rate": 3.752517705190287e-06,
"loss": 0.0683,
"step": 788
},
{
"epoch": 1.1620029455081002,
"grad_norm": 2.4389409096597396,
"learning_rate": 3.741319867685268e-06,
"loss": 0.0938,
"step": 789
},
{
"epoch": 1.1634756995581739,
"grad_norm": 2.573029251842867,
"learning_rate": 3.7301287663867002e-06,
"loss": 0.076,
"step": 790
},
{
"epoch": 1.1649484536082475,
"grad_norm": 2.4248690619540736,
"learning_rate": 3.7189444611871383e-06,
"loss": 0.1034,
"step": 791
},
{
"epoch": 1.1664212076583211,
"grad_norm": 2.1949237372666386,
"learning_rate": 3.7077670119427644e-06,
"loss": 0.0776,
"step": 792
},
{
"epoch": 1.1678939617083948,
"grad_norm": 2.3811799264526234,
"learning_rate": 3.6965964784730717e-06,
"loss": 0.068,
"step": 793
},
{
"epoch": 1.1693667157584684,
"grad_norm": 2.1656743920814336,
"learning_rate": 3.68543292056054e-06,
"loss": 0.0745,
"step": 794
},
{
"epoch": 1.170839469808542,
"grad_norm": 2.5534075805210303,
"learning_rate": 3.674276397950316e-06,
"loss": 0.0721,
"step": 795
},
{
"epoch": 1.1723122238586157,
"grad_norm": 3.2077906538032117,
"learning_rate": 3.6631269703498974e-06,
"loss": 0.0979,
"step": 796
},
{
"epoch": 1.1737849779086893,
"grad_norm": 3.4097966300360545,
"learning_rate": 3.65198469742881e-06,
"loss": 0.0873,
"step": 797
},
{
"epoch": 1.175257731958763,
"grad_norm": 2.5911991361263036,
"learning_rate": 3.6408496388182857e-06,
"loss": 0.0706,
"step": 798
},
{
"epoch": 1.1767304860088366,
"grad_norm": 2.1867381102958636,
"learning_rate": 3.6297218541109537e-06,
"loss": 0.0552,
"step": 799
},
{
"epoch": 1.1782032400589102,
"grad_norm": 2.571703668717305,
"learning_rate": 3.61860140286051e-06,
"loss": 0.0658,
"step": 800
},
{
"epoch": 1.1796759941089838,
"grad_norm": 2.8609822870164665,
"learning_rate": 3.6074883445814024e-06,
"loss": 0.0871,
"step": 801
},
{
"epoch": 1.1811487481590575,
"grad_norm": 2.570340510742732,
"learning_rate": 3.596382738748516e-06,
"loss": 0.0808,
"step": 802
},
{
"epoch": 1.182621502209131,
"grad_norm": 2.773398165504677,
"learning_rate": 3.5852846447968526e-06,
"loss": 0.0572,
"step": 803
},
{
"epoch": 1.1840942562592047,
"grad_norm": 2.8682200523341845,
"learning_rate": 3.574194122121207e-06,
"loss": 0.0938,
"step": 804
},
{
"epoch": 1.1855670103092784,
"grad_norm": 2.1077416494570937,
"learning_rate": 3.5631112300758595e-06,
"loss": 0.0559,
"step": 805
},
{
"epoch": 1.187039764359352,
"grad_norm": 2.19204957528833,
"learning_rate": 3.55203602797425e-06,
"loss": 0.086,
"step": 806
},
{
"epoch": 1.1885125184094256,
"grad_norm": 2.491020520284403,
"learning_rate": 3.5409685750886624e-06,
"loss": 0.0791,
"step": 807
},
{
"epoch": 1.1899852724594993,
"grad_norm": 3.2077684809179625,
"learning_rate": 3.52990893064991e-06,
"loss": 0.1018,
"step": 808
},
{
"epoch": 1.1914580265095729,
"grad_norm": 2.4858054288275353,
"learning_rate": 3.518857153847019e-06,
"loss": 0.0642,
"step": 809
},
{
"epoch": 1.1929307805596465,
"grad_norm": 2.65749978557633,
"learning_rate": 3.5078133038269034e-06,
"loss": 0.0841,
"step": 810
},
{
"epoch": 1.1944035346097202,
"grad_norm": 2.8455934007145682,
"learning_rate": 3.4967774396940606e-06,
"loss": 0.07,
"step": 811
},
{
"epoch": 1.1958762886597938,
"grad_norm": 2.198946509408161,
"learning_rate": 3.4857496205102475e-06,
"loss": 0.0794,
"step": 812
},
{
"epoch": 1.1973490427098674,
"grad_norm": 2.3707778356340956,
"learning_rate": 3.474729905294163e-06,
"loss": 0.0525,
"step": 813
},
{
"epoch": 1.198821796759941,
"grad_norm": 3.1478887888758083,
"learning_rate": 3.463718353021138e-06,
"loss": 0.0825,
"step": 814
},
{
"epoch": 1.2002945508100147,
"grad_norm": 2.5941236487021047,
"learning_rate": 3.45271502262282e-06,
"loss": 0.0667,
"step": 815
},
{
"epoch": 1.2017673048600883,
"grad_norm": 3.1251883051921485,
"learning_rate": 3.441719972986846e-06,
"loss": 0.0715,
"step": 816
},
{
"epoch": 1.203240058910162,
"grad_norm": 2.4878882120381776,
"learning_rate": 3.430733262956544e-06,
"loss": 0.0598,
"step": 817
},
{
"epoch": 1.2047128129602356,
"grad_norm": 3.081778132428439,
"learning_rate": 3.4197549513306076e-06,
"loss": 0.0813,
"step": 818
},
{
"epoch": 1.2061855670103092,
"grad_norm": 2.9161799323233115,
"learning_rate": 3.4087850968627823e-06,
"loss": 0.0888,
"step": 819
},
{
"epoch": 1.2076583210603828,
"grad_norm": 3.0579431841201057,
"learning_rate": 3.3978237582615535e-06,
"loss": 0.0806,
"step": 820
},
{
"epoch": 1.2091310751104565,
"grad_norm": 2.4024003584699662,
"learning_rate": 3.3868709941898325e-06,
"loss": 0.0685,
"step": 821
},
{
"epoch": 1.21060382916053,
"grad_norm": 2.541898615431858,
"learning_rate": 3.37592686326464e-06,
"loss": 0.0991,
"step": 822
},
{
"epoch": 1.2120765832106037,
"grad_norm": 2.522454961599684,
"learning_rate": 3.364991424056794e-06,
"loss": 0.0693,
"step": 823
},
{
"epoch": 1.2135493372606774,
"grad_norm": 2.55384548126942,
"learning_rate": 3.3540647350905985e-06,
"loss": 0.06,
"step": 824
},
{
"epoch": 1.2150220913107512,
"grad_norm": 2.620048246050263,
"learning_rate": 3.343146854843523e-06,
"loss": 0.0829,
"step": 825
},
{
"epoch": 1.2164948453608249,
"grad_norm": 2.271484577946098,
"learning_rate": 3.3322378417458985e-06,
"loss": 0.0708,
"step": 826
},
{
"epoch": 1.2179675994108985,
"grad_norm": 2.5246193069267138,
"learning_rate": 3.3213377541805995e-06,
"loss": 0.0782,
"step": 827
},
{
"epoch": 1.2194403534609721,
"grad_norm": 2.108031142465364,
"learning_rate": 3.3104466504827327e-06,
"loss": 0.0732,
"step": 828
},
{
"epoch": 1.2209131075110458,
"grad_norm": 3.3236120150572113,
"learning_rate": 3.2995645889393278e-06,
"loss": 0.0671,
"step": 829
},
{
"epoch": 1.2223858615611194,
"grad_norm": 2.6374669794434404,
"learning_rate": 3.288691627789017e-06,
"loss": 0.0813,
"step": 830
},
{
"epoch": 1.223858615611193,
"grad_norm": 2.851817066818382,
"learning_rate": 3.277827825221733e-06,
"loss": 0.0706,
"step": 831
},
{
"epoch": 1.2253313696612667,
"grad_norm": 2.083133515619651,
"learning_rate": 3.2669732393783944e-06,
"loss": 0.0664,
"step": 832
},
{
"epoch": 1.2268041237113403,
"grad_norm": 2.5390230676594863,
"learning_rate": 3.2561279283505888e-06,
"loss": 0.0824,
"step": 833
},
{
"epoch": 1.228276877761414,
"grad_norm": 2.4416648845051707,
"learning_rate": 3.2452919501802714e-06,
"loss": 0.0754,
"step": 834
},
{
"epoch": 1.2297496318114876,
"grad_norm": 2.8801480217785156,
"learning_rate": 3.234465362859451e-06,
"loss": 0.0714,
"step": 835
},
{
"epoch": 1.2312223858615612,
"grad_norm": 2.5597515235457005,
"learning_rate": 3.223648224329872e-06,
"loss": 0.054,
"step": 836
},
{
"epoch": 1.2326951399116348,
"grad_norm": 2.4482923554226788,
"learning_rate": 3.2128405924827154e-06,
"loss": 0.0829,
"step": 837
},
{
"epoch": 1.2341678939617085,
"grad_norm": 2.856207881577955,
"learning_rate": 3.202042525158284e-06,
"loss": 0.0818,
"step": 838
},
{
"epoch": 1.235640648011782,
"grad_norm": 2.4499839989641465,
"learning_rate": 3.191254080145695e-06,
"loss": 0.0806,
"step": 839
},
{
"epoch": 1.2371134020618557,
"grad_norm": 2.273801185202566,
"learning_rate": 3.180475315182563e-06,
"loss": 0.0597,
"step": 840
},
{
"epoch": 1.2385861561119293,
"grad_norm": 2.261133708950399,
"learning_rate": 3.1697062879547014e-06,
"loss": 0.0877,
"step": 841
},
{
"epoch": 1.240058910162003,
"grad_norm": 2.2872555405628248,
"learning_rate": 3.1589470560958104e-06,
"loss": 0.0795,
"step": 842
},
{
"epoch": 1.2415316642120766,
"grad_norm": 2.295707839329834,
"learning_rate": 3.1481976771871627e-06,
"loss": 0.0636,
"step": 843
},
{
"epoch": 1.2430044182621502,
"grad_norm": 3.227716605839925,
"learning_rate": 3.1374582087573026e-06,
"loss": 0.1081,
"step": 844
},
{
"epoch": 1.2444771723122239,
"grad_norm": 2.4225054229446905,
"learning_rate": 3.1267287082817376e-06,
"loss": 0.09,
"step": 845
},
{
"epoch": 1.2459499263622975,
"grad_norm": 2.8376508149993933,
"learning_rate": 3.1160092331826235e-06,
"loss": 0.0624,
"step": 846
},
{
"epoch": 1.2474226804123711,
"grad_norm": 2.8282605439311306,
"learning_rate": 3.1052998408284664e-06,
"loss": 0.081,
"step": 847
},
{
"epoch": 1.2488954344624448,
"grad_norm": 2.947001349052833,
"learning_rate": 3.0946005885338116e-06,
"loss": 0.0871,
"step": 848
},
{
"epoch": 1.2503681885125184,
"grad_norm": 2.778754685623843,
"learning_rate": 3.083911533558934e-06,
"loss": 0.0745,
"step": 849
},
{
"epoch": 1.251840942562592,
"grad_norm": 2.384050662267025,
"learning_rate": 3.073232733109536e-06,
"loss": 0.059,
"step": 850
},
{
"epoch": 1.2533136966126657,
"grad_norm": 2.363442330975993,
"learning_rate": 3.0625642443364407e-06,
"loss": 0.0722,
"step": 851
},
{
"epoch": 1.2547864506627393,
"grad_norm": 2.4499491297154634,
"learning_rate": 3.0519061243352833e-06,
"loss": 0.0624,
"step": 852
},
{
"epoch": 1.256259204712813,
"grad_norm": 2.3949434799317624,
"learning_rate": 3.041258430146208e-06,
"loss": 0.0587,
"step": 853
},
{
"epoch": 1.2577319587628866,
"grad_norm": 2.780602647537924,
"learning_rate": 3.0306212187535653e-06,
"loss": 0.087,
"step": 854
},
{
"epoch": 1.2592047128129602,
"grad_norm": 3.1983788447860344,
"learning_rate": 3.0199945470855975e-06,
"loss": 0.1062,
"step": 855
},
{
"epoch": 1.2606774668630338,
"grad_norm": 2.3750027999673753,
"learning_rate": 3.0093784720141456e-06,
"loss": 0.0671,
"step": 856
},
{
"epoch": 1.2621502209131075,
"grad_norm": 2.2615956244581263,
"learning_rate": 2.9987730503543387e-06,
"loss": 0.0539,
"step": 857
},
{
"epoch": 1.263622974963181,
"grad_norm": 2.48990411000989,
"learning_rate": 2.988178338864289e-06,
"loss": 0.0673,
"step": 858
},
{
"epoch": 1.2650957290132547,
"grad_norm": 2.2036932772255544,
"learning_rate": 2.9775943942447915e-06,
"loss": 0.0738,
"step": 859
},
{
"epoch": 1.2665684830633284,
"grad_norm": 2.795202986209916,
"learning_rate": 2.9670212731390202e-06,
"loss": 0.0757,
"step": 860
},
{
"epoch": 1.268041237113402,
"grad_norm": 2.895103572350162,
"learning_rate": 2.9564590321322206e-06,
"loss": 0.0822,
"step": 861
},
{
"epoch": 1.2695139911634756,
"grad_norm": 2.747203222079235,
"learning_rate": 2.945907727751412e-06,
"loss": 0.0628,
"step": 862
},
{
"epoch": 1.2709867452135493,
"grad_norm": 3.0674742113404236,
"learning_rate": 2.935367416465085e-06,
"loss": 0.0809,
"step": 863
},
{
"epoch": 1.272459499263623,
"grad_norm": 2.3588856890649574,
"learning_rate": 2.924838154682893e-06,
"loss": 0.053,
"step": 864
},
{
"epoch": 1.2739322533136965,
"grad_norm": 2.261318532560985,
"learning_rate": 2.9143199987553574e-06,
"loss": 0.0806,
"step": 865
},
{
"epoch": 1.2754050073637702,
"grad_norm": 3.3505626197518934,
"learning_rate": 2.9038130049735634e-06,
"loss": 0.067,
"step": 866
},
{
"epoch": 1.2768777614138438,
"grad_norm": 2.3336129262324685,
"learning_rate": 2.8933172295688576e-06,
"loss": 0.0575,
"step": 867
},
{
"epoch": 1.2783505154639174,
"grad_norm": 2.40059539734755,
"learning_rate": 2.882832728712551e-06,
"loss": 0.0771,
"step": 868
},
{
"epoch": 1.279823269513991,
"grad_norm": 2.7561605556229343,
"learning_rate": 2.8723595585156083e-06,
"loss": 0.08,
"step": 869
},
{
"epoch": 1.2812960235640647,
"grad_norm": 2.547320563573267,
"learning_rate": 2.8618977750283605e-06,
"loss": 0.051,
"step": 870
},
{
"epoch": 1.2827687776141383,
"grad_norm": 1.9709609992186161,
"learning_rate": 2.8514474342402006e-06,
"loss": 0.0631,
"step": 871
},
{
"epoch": 1.284241531664212,
"grad_norm": 2.9809387595001047,
"learning_rate": 2.841008592079281e-06,
"loss": 0.0789,
"step": 872
},
{
"epoch": 1.2857142857142856,
"grad_norm": 2.86008312832222,
"learning_rate": 2.83058130441221e-06,
"loss": 0.0888,
"step": 873
},
{
"epoch": 1.2871870397643592,
"grad_norm": 2.4796550101557937,
"learning_rate": 2.8201656270437662e-06,
"loss": 0.066,
"step": 874
},
{
"epoch": 1.2886597938144329,
"grad_norm": 2.6030800209190668,
"learning_rate": 2.8097616157165886e-06,
"loss": 0.0662,
"step": 875
},
{
"epoch": 1.2901325478645067,
"grad_norm": 2.553858828851561,
"learning_rate": 2.7993693261108823e-06,
"loss": 0.07,
"step": 876
},
{
"epoch": 1.2916053019145803,
"grad_norm": 2.1429985168409353,
"learning_rate": 2.788988813844121e-06,
"loss": 0.0547,
"step": 877
},
{
"epoch": 1.293078055964654,
"grad_norm": 2.306635027448473,
"learning_rate": 2.7786201344707487e-06,
"loss": 0.0802,
"step": 878
},
{
"epoch": 1.2945508100147276,
"grad_norm": 2.248947781614172,
"learning_rate": 2.768263343481881e-06,
"loss": 0.08,
"step": 879
},
{
"epoch": 1.2960235640648012,
"grad_norm": 2.958537556220352,
"learning_rate": 2.7579184963050056e-06,
"loss": 0.0664,
"step": 880
},
{
"epoch": 1.2974963181148749,
"grad_norm": 2.4922414343913664,
"learning_rate": 2.7475856483036967e-06,
"loss": 0.0677,
"step": 881
},
{
"epoch": 1.2989690721649485,
"grad_norm": 2.562197068557909,
"learning_rate": 2.7372648547773063e-06,
"loss": 0.0728,
"step": 882
},
{
"epoch": 1.3004418262150221,
"grad_norm": 2.1123483743229117,
"learning_rate": 2.726956170960674e-06,
"loss": 0.0579,
"step": 883
},
{
"epoch": 1.3019145802650958,
"grad_norm": 2.7651251279632363,
"learning_rate": 2.716659652023833e-06,
"loss": 0.0786,
"step": 884
},
{
"epoch": 1.3033873343151694,
"grad_norm": 3.0311493031355665,
"learning_rate": 2.706375353071712e-06,
"loss": 0.0729,
"step": 885
},
{
"epoch": 1.304860088365243,
"grad_norm": 2.6379196457688505,
"learning_rate": 2.6961033291438343e-06,
"loss": 0.0882,
"step": 886
},
{
"epoch": 1.3063328424153167,
"grad_norm": 2.3533767366059606,
"learning_rate": 2.685843635214038e-06,
"loss": 0.079,
"step": 887
},
{
"epoch": 1.3078055964653903,
"grad_norm": 2.4437114285251904,
"learning_rate": 2.6755963261901706e-06,
"loss": 0.0612,
"step": 888
},
{
"epoch": 1.309278350515464,
"grad_norm": 2.7288761273981263,
"learning_rate": 2.665361456913797e-06,
"loss": 0.0987,
"step": 889
},
{
"epoch": 1.3107511045655376,
"grad_norm": 2.0952523055264165,
"learning_rate": 2.655139082159908e-06,
"loss": 0.0561,
"step": 890
},
{
"epoch": 1.3122238586156112,
"grad_norm": 2.769276897165276,
"learning_rate": 2.644929256636628e-06,
"loss": 0.0963,
"step": 891
},
{
"epoch": 1.3136966126656848,
"grad_norm": 2.2960722586747524,
"learning_rate": 2.634732034984915e-06,
"loss": 0.0568,
"step": 892
},
{
"epoch": 1.3151693667157585,
"grad_norm": 2.787153179831851,
"learning_rate": 2.624547471778278e-06,
"loss": 0.0901,
"step": 893
},
{
"epoch": 1.316642120765832,
"grad_norm": 2.861891095875749,
"learning_rate": 2.6143756215224803e-06,
"loss": 0.1107,
"step": 894
},
{
"epoch": 1.3181148748159057,
"grad_norm": 2.2294971099864336,
"learning_rate": 2.604216538655247e-06,
"loss": 0.0791,
"step": 895
},
{
"epoch": 1.3195876288659794,
"grad_norm": 2.3877245266570846,
"learning_rate": 2.594070277545975e-06,
"loss": 0.0725,
"step": 896
},
{
"epoch": 1.321060382916053,
"grad_norm": 2.806049349126312,
"learning_rate": 2.5839368924954435e-06,
"loss": 0.1136,
"step": 897
},
{
"epoch": 1.3225331369661266,
"grad_norm": 2.8513599503802087,
"learning_rate": 2.5738164377355148e-06,
"loss": 0.0776,
"step": 898
},
{
"epoch": 1.3240058910162003,
"grad_norm": 3.1114741400011074,
"learning_rate": 2.563708967428859e-06,
"loss": 0.0658,
"step": 899
},
{
"epoch": 1.3254786450662739,
"grad_norm": 2.2743662613163522,
"learning_rate": 2.5536145356686528e-06,
"loss": 0.0572,
"step": 900
},
{
"epoch": 1.3269513991163475,
"grad_norm": 2.268767817457225,
"learning_rate": 2.5435331964782916e-06,
"loss": 0.0746,
"step": 901
},
{
"epoch": 1.3284241531664212,
"grad_norm": 1.9907870732319457,
"learning_rate": 2.5334650038111045e-06,
"loss": 0.0504,
"step": 902
},
{
"epoch": 1.3298969072164948,
"grad_norm": 2.3485970715966764,
"learning_rate": 2.5234100115500643e-06,
"loss": 0.054,
"step": 903
},
{
"epoch": 1.3313696612665684,
"grad_norm": 2.7559810357796413,
"learning_rate": 2.5133682735074904e-06,
"loss": 0.0696,
"step": 904
},
{
"epoch": 1.332842415316642,
"grad_norm": 2.833175599971209,
"learning_rate": 2.503339843424777e-06,
"loss": 0.0623,
"step": 905
},
{
"epoch": 1.3343151693667157,
"grad_norm": 2.0828464328941103,
"learning_rate": 2.4933247749720912e-06,
"loss": 0.0584,
"step": 906
},
{
"epoch": 1.3357879234167893,
"grad_norm": 2.4533237420565115,
"learning_rate": 2.483323121748094e-06,
"loss": 0.0693,
"step": 907
},
{
"epoch": 1.3372606774668632,
"grad_norm": 2.646584000152378,
"learning_rate": 2.4733349372796506e-06,
"loss": 0.086,
"step": 908
},
{
"epoch": 1.3387334315169368,
"grad_norm": 2.2230708948262246,
"learning_rate": 2.4633602750215447e-06,
"loss": 0.0728,
"step": 909
},
{
"epoch": 1.3402061855670104,
"grad_norm": 2.810498628664279,
"learning_rate": 2.4533991883561868e-06,
"loss": 0.0873,
"step": 910
},
{
"epoch": 1.341678939617084,
"grad_norm": 2.248467777163056,
"learning_rate": 2.4434517305933394e-06,
"loss": 0.0672,
"step": 911
},
{
"epoch": 1.3431516936671577,
"grad_norm": 2.1367635036683184,
"learning_rate": 2.4335179549698233e-06,
"loss": 0.0499,
"step": 912
},
{
"epoch": 1.3446244477172313,
"grad_norm": 3.2725373522512142,
"learning_rate": 2.423597914649234e-06,
"loss": 0.1078,
"step": 913
},
{
"epoch": 1.346097201767305,
"grad_norm": 2.867491627460914,
"learning_rate": 2.4136916627216656e-06,
"loss": 0.0797,
"step": 914
},
{
"epoch": 1.3475699558173786,
"grad_norm": 2.094438599376492,
"learning_rate": 2.403799252203408e-06,
"loss": 0.0744,
"step": 915
},
{
"epoch": 1.3490427098674522,
"grad_norm": 2.8403655449439413,
"learning_rate": 2.393920736036683e-06,
"loss": 0.0636,
"step": 916
},
{
"epoch": 1.3505154639175259,
"grad_norm": 2.2641504729859117,
"learning_rate": 2.38405616708935e-06,
"loss": 0.0759,
"step": 917
},
{
"epoch": 1.3519882179675995,
"grad_norm": 2.6990721664555015,
"learning_rate": 2.374205598154624e-06,
"loss": 0.0715,
"step": 918
},
{
"epoch": 1.3534609720176731,
"grad_norm": 2.0450429212053645,
"learning_rate": 2.3643690819507984e-06,
"loss": 0.0687,
"step": 919
},
{
"epoch": 1.3549337260677468,
"grad_norm": 2.815497301963677,
"learning_rate": 2.3545466711209585e-06,
"loss": 0.0751,
"step": 920
},
{
"epoch": 1.3564064801178204,
"grad_norm": 2.0833620917161517,
"learning_rate": 2.3447384182326948e-06,
"loss": 0.0606,
"step": 921
},
{
"epoch": 1.357879234167894,
"grad_norm": 2.5027749716472334,
"learning_rate": 2.3349443757778346e-06,
"loss": 0.09,
"step": 922
},
{
"epoch": 1.3593519882179677,
"grad_norm": 2.381133387963809,
"learning_rate": 2.3251645961721494e-06,
"loss": 0.0671,
"step": 923
},
{
"epoch": 1.3608247422680413,
"grad_norm": 2.9284274100164565,
"learning_rate": 2.315399131755081e-06,
"loss": 0.0831,
"step": 924
},
{
"epoch": 1.362297496318115,
"grad_norm": 2.242298453649294,
"learning_rate": 2.3056480347894584e-06,
"loss": 0.0617,
"step": 925
},
{
"epoch": 1.3637702503681886,
"grad_norm": 3.411313418432218,
"learning_rate": 2.2959113574612204e-06,
"loss": 0.1254,
"step": 926
},
{
"epoch": 1.3652430044182622,
"grad_norm": 2.2293073196874853,
"learning_rate": 2.2861891518791287e-06,
"loss": 0.0481,
"step": 927
},
{
"epoch": 1.3667157584683358,
"grad_norm": 2.1743245071883273,
"learning_rate": 2.2764814700745025e-06,
"loss": 0.068,
"step": 928
},
{
"epoch": 1.3681885125184094,
"grad_norm": 2.373291429434674,
"learning_rate": 2.266788364000929e-06,
"loss": 0.0452,
"step": 929
},
{
"epoch": 1.369661266568483,
"grad_norm": 2.597864530271633,
"learning_rate": 2.25710988553399e-06,
"loss": 0.0878,
"step": 930
},
{
"epoch": 1.3711340206185567,
"grad_norm": 3.034163746055045,
"learning_rate": 2.2474460864709825e-06,
"loss": 0.0779,
"step": 931
},
{
"epoch": 1.3726067746686303,
"grad_norm": 3.5301260309240057,
"learning_rate": 2.2377970185306424e-06,
"loss": 0.097,
"step": 932
},
{
"epoch": 1.374079528718704,
"grad_norm": 2.4514313697898213,
"learning_rate": 2.22816273335287e-06,
"loss": 0.0548,
"step": 933
},
{
"epoch": 1.3755522827687776,
"grad_norm": 3.2426641042269413,
"learning_rate": 2.2185432824984455e-06,
"loss": 0.0735,
"step": 934
},
{
"epoch": 1.3770250368188512,
"grad_norm": 2.2441096467851036,
"learning_rate": 2.208938717448763e-06,
"loss": 0.0587,
"step": 935
},
{
"epoch": 1.3784977908689249,
"grad_norm": 2.1614600826394508,
"learning_rate": 2.1993490896055514e-06,
"loss": 0.0593,
"step": 936
},
{
"epoch": 1.3799705449189985,
"grad_norm": 2.246163741793099,
"learning_rate": 2.1897744502905955e-06,
"loss": 0.0617,
"step": 937
},
{
"epoch": 1.3814432989690721,
"grad_norm": 2.222744117217539,
"learning_rate": 2.1802148507454675e-06,
"loss": 0.069,
"step": 938
},
{
"epoch": 1.3829160530191458,
"grad_norm": 2.2021966299628555,
"learning_rate": 2.170670342131249e-06,
"loss": 0.0747,
"step": 939
},
{
"epoch": 1.3843888070692194,
"grad_norm": 2.6432292171263985,
"learning_rate": 2.1611409755282542e-06,
"loss": 0.0889,
"step": 940
},
{
"epoch": 1.385861561119293,
"grad_norm": 2.3868568237992696,
"learning_rate": 2.1516268019357656e-06,
"loss": 0.0928,
"step": 941
},
{
"epoch": 1.3873343151693667,
"grad_norm": 2.1533933661208984,
"learning_rate": 2.1421278722717524e-06,
"loss": 0.0482,
"step": 942
},
{
"epoch": 1.3888070692194403,
"grad_norm": 2.478893589441798,
"learning_rate": 2.132644237372603e-06,
"loss": 0.0749,
"step": 943
},
{
"epoch": 1.390279823269514,
"grad_norm": 2.4965674075650814,
"learning_rate": 2.123175947992851e-06,
"loss": 0.0617,
"step": 944
},
{
"epoch": 1.3917525773195876,
"grad_norm": 2.5896583305139353,
"learning_rate": 2.1137230548049042e-06,
"loss": 0.0783,
"step": 945
},
{
"epoch": 1.3932253313696612,
"grad_norm": 2.5229928189159163,
"learning_rate": 2.1042856083987694e-06,
"loss": 0.0721,
"step": 946
},
{
"epoch": 1.3946980854197348,
"grad_norm": 2.620841054032406,
"learning_rate": 2.09486365928179e-06,
"loss": 0.0478,
"step": 947
},
{
"epoch": 1.3961708394698085,
"grad_norm": 2.4635557419644627,
"learning_rate": 2.085457257878369e-06,
"loss": 0.0665,
"step": 948
},
{
"epoch": 1.397643593519882,
"grad_norm": 2.491834759671326,
"learning_rate": 2.076066454529701e-06,
"loss": 0.0885,
"step": 949
},
{
"epoch": 1.3991163475699557,
"grad_norm": 2.1247321015069893,
"learning_rate": 2.0666912994935034e-06,
"loss": 0.0486,
"step": 950
},
{
"epoch": 1.4005891016200294,
"grad_norm": 2.0938641025252127,
"learning_rate": 2.0573318429437487e-06,
"loss": 0.0588,
"step": 951
},
{
"epoch": 1.402061855670103,
"grad_norm": 2.994410693825716,
"learning_rate": 2.0479881349703885e-06,
"loss": 0.0778,
"step": 952
},
{
"epoch": 1.4035346097201766,
"grad_norm": 2.6687029575886516,
"learning_rate": 2.038660225579096e-06,
"loss": 0.0671,
"step": 953
},
{
"epoch": 1.4050073637702503,
"grad_norm": 2.5657403990694547,
"learning_rate": 2.0293481646909934e-06,
"loss": 0.0703,
"step": 954
},
{
"epoch": 1.406480117820324,
"grad_norm": 2.7104348627571153,
"learning_rate": 2.0200520021423813e-06,
"loss": 0.0752,
"step": 955
},
{
"epoch": 1.4079528718703975,
"grad_norm": 2.792438454224809,
"learning_rate": 2.010771787684484e-06,
"loss": 0.0747,
"step": 956
},
{
"epoch": 1.4094256259204712,
"grad_norm": 3.00295539481228,
"learning_rate": 2.0015075709831634e-06,
"loss": 0.0778,
"step": 957
},
{
"epoch": 1.4108983799705448,
"grad_norm": 2.7396339188862617,
"learning_rate": 1.9922594016186716e-06,
"loss": 0.0748,
"step": 958
},
{
"epoch": 1.4123711340206184,
"grad_norm": 2.2053073159941095,
"learning_rate": 1.983027329085377e-06,
"loss": 0.0539,
"step": 959
},
{
"epoch": 1.413843888070692,
"grad_norm": 2.534983064810876,
"learning_rate": 1.9738114027915007e-06,
"loss": 0.0938,
"step": 960
},
{
"epoch": 1.415316642120766,
"grad_norm": 2.4967187142059393,
"learning_rate": 1.9646116720588525e-06,
"loss": 0.0656,
"step": 961
},
{
"epoch": 1.4167893961708395,
"grad_norm": 2.066492663646761,
"learning_rate": 1.9554281861225694e-06,
"loss": 0.0424,
"step": 962
},
{
"epoch": 1.4182621502209132,
"grad_norm": 2.771730400835999,
"learning_rate": 1.946260994130843e-06,
"loss": 0.0573,
"step": 963
},
{
"epoch": 1.4197349042709868,
"grad_norm": 2.8791465865837536,
"learning_rate": 1.9371101451446685e-06,
"loss": 0.0859,
"step": 964
},
{
"epoch": 1.4212076583210604,
"grad_norm": 3.2248308136532047,
"learning_rate": 1.9279756881375746e-06,
"loss": 0.0805,
"step": 965
},
{
"epoch": 1.422680412371134,
"grad_norm": 2.8777696252816516,
"learning_rate": 1.9188576719953635e-06,
"loss": 0.0769,
"step": 966
},
{
"epoch": 1.4241531664212077,
"grad_norm": 2.5130714611958704,
"learning_rate": 1.9097561455158502e-06,
"loss": 0.0812,
"step": 967
},
{
"epoch": 1.4256259204712813,
"grad_norm": 2.531692326473217,
"learning_rate": 1.9006711574086006e-06,
"loss": 0.065,
"step": 968
},
{
"epoch": 1.427098674521355,
"grad_norm": 2.376440428249521,
"learning_rate": 1.8916027562946659e-06,
"loss": 0.063,
"step": 969
},
{
"epoch": 1.4285714285714286,
"grad_norm": 2.1528998094878276,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.0688,
"step": 970
},
{
"epoch": 1.4300441826215022,
"grad_norm": 2.530258164073908,
"learning_rate": 1.873515909086855e-06,
"loss": 0.0647,
"step": 971
},
{
"epoch": 1.4315169366715759,
"grad_norm": 2.663885408371276,
"learning_rate": 1.8644975597901977e-06,
"loss": 0.0633,
"step": 972
},
{
"epoch": 1.4329896907216495,
"grad_norm": 2.608204191111035,
"learning_rate": 1.8554959910807773e-06,
"loss": 0.0487,
"step": 973
},
{
"epoch": 1.4344624447717231,
"grad_norm": 2.5581762253885616,
"learning_rate": 1.8465112511332068e-06,
"loss": 0.0654,
"step": 974
},
{
"epoch": 1.4359351988217968,
"grad_norm": 2.1927550988613027,
"learning_rate": 1.8375433880320293e-06,
"loss": 0.0603,
"step": 975
},
{
"epoch": 1.4374079528718704,
"grad_norm": 2.5356027344331804,
"learning_rate": 1.8285924497714702e-06,
"loss": 0.0843,
"step": 976
},
{
"epoch": 1.438880706921944,
"grad_norm": 2.126438287431127,
"learning_rate": 1.8196584842551772e-06,
"loss": 0.0704,
"step": 977
},
{
"epoch": 1.4403534609720177,
"grad_norm": 2.4680758427563054,
"learning_rate": 1.8107415392959615e-06,
"loss": 0.0481,
"step": 978
},
{
"epoch": 1.4418262150220913,
"grad_norm": 2.4116108572599155,
"learning_rate": 1.8018416626155443e-06,
"loss": 0.0457,
"step": 979
},
{
"epoch": 1.443298969072165,
"grad_norm": 2.475505185016457,
"learning_rate": 1.7929589018443016e-06,
"loss": 0.0838,
"step": 980
},
{
"epoch": 1.4447717231222386,
"grad_norm": 3.190060647366546,
"learning_rate": 1.7840933045210052e-06,
"loss": 0.0826,
"step": 981
},
{
"epoch": 1.4462444771723122,
"grad_norm": 1.9808370523468428,
"learning_rate": 1.7752449180925746e-06,
"loss": 0.0588,
"step": 982
},
{
"epoch": 1.4477172312223858,
"grad_norm": 3.0854014410212383,
"learning_rate": 1.7664137899138195e-06,
"loss": 0.0621,
"step": 983
},
{
"epoch": 1.4491899852724595,
"grad_norm": 2.0882868636972116,
"learning_rate": 1.7575999672471866e-06,
"loss": 0.0505,
"step": 984
},
{
"epoch": 1.450662739322533,
"grad_norm": 2.444569753698376,
"learning_rate": 1.7488034972625067e-06,
"loss": 0.0704,
"step": 985
},
{
"epoch": 1.4521354933726067,
"grad_norm": 2.2628946150746154,
"learning_rate": 1.7400244270367429e-06,
"loss": 0.0722,
"step": 986
},
{
"epoch": 1.4536082474226804,
"grad_norm": 2.732925846230406,
"learning_rate": 1.7312628035537388e-06,
"loss": 0.0611,
"step": 987
},
{
"epoch": 1.455081001472754,
"grad_norm": 2.4632091259565057,
"learning_rate": 1.7225186737039639e-06,
"loss": 0.0819,
"step": 988
},
{
"epoch": 1.4565537555228276,
"grad_norm": 2.989212148661273,
"learning_rate": 1.7137920842842675e-06,
"loss": 0.0861,
"step": 989
},
{
"epoch": 1.4580265095729013,
"grad_norm": 2.221628195006125,
"learning_rate": 1.7050830819976266e-06,
"loss": 0.0375,
"step": 990
},
{
"epoch": 1.4594992636229749,
"grad_norm": 2.4962489534632533,
"learning_rate": 1.696391713452893e-06,
"loss": 0.0953,
"step": 991
},
{
"epoch": 1.4609720176730487,
"grad_norm": 2.428793575695664,
"learning_rate": 1.6877180251645487e-06,
"loss": 0.071,
"step": 992
},
{
"epoch": 1.4624447717231224,
"grad_norm": 2.171999391444608,
"learning_rate": 1.679062063552454e-06,
"loss": 0.057,
"step": 993
},
{
"epoch": 1.463917525773196,
"grad_norm": 2.7695998022109984,
"learning_rate": 1.6704238749415958e-06,
"loss": 0.0772,
"step": 994
},
{
"epoch": 1.4653902798232696,
"grad_norm": 2.821736257619605,
"learning_rate": 1.6618035055618486e-06,
"loss": 0.0631,
"step": 995
},
{
"epoch": 1.4668630338733433,
"grad_norm": 1.8315536946657645,
"learning_rate": 1.653201001547719e-06,
"loss": 0.0376,
"step": 996
},
{
"epoch": 1.468335787923417,
"grad_norm": 2.7605414365927627,
"learning_rate": 1.6446164089381033e-06,
"loss": 0.0802,
"step": 997
},
{
"epoch": 1.4698085419734905,
"grad_norm": 2.373185594157067,
"learning_rate": 1.6360497736760383e-06,
"loss": 0.073,
"step": 998
},
{
"epoch": 1.4712812960235642,
"grad_norm": 2.7047012920858267,
"learning_rate": 1.6275011416084563e-06,
"loss": 0.0698,
"step": 999
},
{
"epoch": 1.4727540500736378,
"grad_norm": 2.6421087343127705,
"learning_rate": 1.6189705584859422e-06,
"loss": 0.0636,
"step": 1000
},
{
"epoch": 1.4727540500736378,
"eval_loss": 0.1951112598180771,
"eval_runtime": 1.2906,
"eval_samples_per_second": 42.615,
"eval_steps_per_second": 10.847,
"step": 1000
},
{
"epoch": 1.4742268041237114,
"grad_norm": 2.6145048863030906,
"learning_rate": 1.6104580699624839e-06,
"loss": 0.0749,
"step": 1001
},
{
"epoch": 1.475699558173785,
"grad_norm": 2.0303418482498055,
"learning_rate": 1.6019637215952322e-06,
"loss": 0.042,
"step": 1002
},
{
"epoch": 1.4771723122238587,
"grad_norm": 2.7386176337259247,
"learning_rate": 1.593487558844255e-06,
"loss": 0.0725,
"step": 1003
},
{
"epoch": 1.4786450662739323,
"grad_norm": 3.0396736461273757,
"learning_rate": 1.5850296270722965e-06,
"loss": 0.0674,
"step": 1004
},
{
"epoch": 1.480117820324006,
"grad_norm": 3.3616222085844605,
"learning_rate": 1.576589971544526e-06,
"loss": 0.0984,
"step": 1005
},
{
"epoch": 1.4815905743740796,
"grad_norm": 2.375868239069935,
"learning_rate": 1.568168637428309e-06,
"loss": 0.0615,
"step": 1006
},
{
"epoch": 1.4830633284241532,
"grad_norm": 2.6401883677901425,
"learning_rate": 1.559765669792954e-06,
"loss": 0.0653,
"step": 1007
},
{
"epoch": 1.4845360824742269,
"grad_norm": 1.9618914207468363,
"learning_rate": 1.5513811136094786e-06,
"loss": 0.0676,
"step": 1008
},
{
"epoch": 1.4860088365243005,
"grad_norm": 2.357533639079111,
"learning_rate": 1.543015013750364e-06,
"loss": 0.0622,
"step": 1009
},
{
"epoch": 1.4874815905743741,
"grad_norm": 2.7396984560438122,
"learning_rate": 1.5346674149893204e-06,
"loss": 0.0723,
"step": 1010
},
{
"epoch": 1.4889543446244478,
"grad_norm": 1.8910283131839978,
"learning_rate": 1.5263383620010359e-06,
"loss": 0.0629,
"step": 1011
},
{
"epoch": 1.4904270986745214,
"grad_norm": 2.6877543066986322,
"learning_rate": 1.5180278993609527e-06,
"loss": 0.0628,
"step": 1012
},
{
"epoch": 1.491899852724595,
"grad_norm": 2.2672309433321973,
"learning_rate": 1.5097360715450187e-06,
"loss": 0.0648,
"step": 1013
},
{
"epoch": 1.4933726067746687,
"grad_norm": 2.4608356536595952,
"learning_rate": 1.5014629229294525e-06,
"loss": 0.0826,
"step": 1014
},
{
"epoch": 1.4948453608247423,
"grad_norm": 2.359754445643814,
"learning_rate": 1.4932084977905043e-06,
"loss": 0.0471,
"step": 1015
},
{
"epoch": 1.496318114874816,
"grad_norm": 2.027289584732701,
"learning_rate": 1.4849728403042213e-06,
"loss": 0.0561,
"step": 1016
},
{
"epoch": 1.4977908689248896,
"grad_norm": 2.464689524167278,
"learning_rate": 1.4767559945462073e-06,
"loss": 0.0604,
"step": 1017
},
{
"epoch": 1.4992636229749632,
"grad_norm": 2.1349837626514834,
"learning_rate": 1.4685580044913921e-06,
"loss": 0.0627,
"step": 1018
},
{
"epoch": 1.5007363770250368,
"grad_norm": 2.5480519428195256,
"learning_rate": 1.460378914013793e-06,
"loss": 0.0745,
"step": 1019
},
{
"epoch": 1.5022091310751104,
"grad_norm": 2.4685840222085567,
"learning_rate": 1.4522187668862797e-06,
"loss": 0.0508,
"step": 1020
},
{
"epoch": 1.503681885125184,
"grad_norm": 2.384796029124157,
"learning_rate": 1.444077606780342e-06,
"loss": 0.0815,
"step": 1021
},
{
"epoch": 1.5051546391752577,
"grad_norm": 3.0184805492577143,
"learning_rate": 1.4359554772658551e-06,
"loss": 0.0629,
"step": 1022
},
{
"epoch": 1.5066273932253313,
"grad_norm": 2.5109695121506874,
"learning_rate": 1.4278524218108424e-06,
"loss": 0.061,
"step": 1023
},
{
"epoch": 1.508100147275405,
"grad_norm": 2.076656088861386,
"learning_rate": 1.419768483781252e-06,
"loss": 0.0679,
"step": 1024
},
{
"epoch": 1.5095729013254786,
"grad_norm": 2.587031340103411,
"learning_rate": 1.4117037064407164e-06,
"loss": 0.062,
"step": 1025
},
{
"epoch": 1.5110456553755522,
"grad_norm": 2.969930237937353,
"learning_rate": 1.4036581329503245e-06,
"loss": 0.0715,
"step": 1026
},
{
"epoch": 1.5125184094256259,
"grad_norm": 2.148007992080645,
"learning_rate": 1.3956318063683905e-06,
"loss": 0.0567,
"step": 1027
},
{
"epoch": 1.5139911634756995,
"grad_norm": 2.217179463177697,
"learning_rate": 1.3876247696502238e-06,
"loss": 0.0553,
"step": 1028
},
{
"epoch": 1.5154639175257731,
"grad_norm": 2.7466217897393084,
"learning_rate": 1.3796370656478936e-06,
"loss": 0.0487,
"step": 1029
},
{
"epoch": 1.5169366715758468,
"grad_norm": 2.346631548903953,
"learning_rate": 1.3716687371100096e-06,
"loss": 0.0581,
"step": 1030
},
{
"epoch": 1.5184094256259204,
"grad_norm": 2.8877452342792997,
"learning_rate": 1.363719826681486e-06,
"loss": 0.0841,
"step": 1031
},
{
"epoch": 1.519882179675994,
"grad_norm": 2.5602367543535633,
"learning_rate": 1.355790376903315e-06,
"loss": 0.0825,
"step": 1032
},
{
"epoch": 1.5213549337260677,
"grad_norm": 2.707697935594709,
"learning_rate": 1.3478804302123382e-06,
"loss": 0.0661,
"step": 1033
},
{
"epoch": 1.5228276877761413,
"grad_norm": 2.3215844194395316,
"learning_rate": 1.3399900289410245e-06,
"loss": 0.0714,
"step": 1034
},
{
"epoch": 1.524300441826215,
"grad_norm": 2.260662446674949,
"learning_rate": 1.332119215317233e-06,
"loss": 0.0463,
"step": 1035
},
{
"epoch": 1.5257731958762886,
"grad_norm": 2.399278959045141,
"learning_rate": 1.3242680314639995e-06,
"loss": 0.0561,
"step": 1036
},
{
"epoch": 1.5272459499263622,
"grad_norm": 2.4713132106462847,
"learning_rate": 1.3164365193993017e-06,
"loss": 0.0678,
"step": 1037
},
{
"epoch": 1.5287187039764358,
"grad_norm": 2.066649377770113,
"learning_rate": 1.30862472103584e-06,
"loss": 0.0568,
"step": 1038
},
{
"epoch": 1.5301914580265095,
"grad_norm": 2.853499333421734,
"learning_rate": 1.30083267818081e-06,
"loss": 0.0683,
"step": 1039
},
{
"epoch": 1.531664212076583,
"grad_norm": 2.623260806713472,
"learning_rate": 1.2930604325356793e-06,
"loss": 0.069,
"step": 1040
},
{
"epoch": 1.5331369661266567,
"grad_norm": 2.694604045990694,
"learning_rate": 1.2853080256959655e-06,
"loss": 0.0801,
"step": 1041
},
{
"epoch": 1.5346097201767304,
"grad_norm": 2.7935173842568926,
"learning_rate": 1.277575499151013e-06,
"loss": 0.0883,
"step": 1042
},
{
"epoch": 1.536082474226804,
"grad_norm": 2.9683928203439613,
"learning_rate": 1.2698628942837698e-06,
"loss": 0.0793,
"step": 1043
},
{
"epoch": 1.5375552282768776,
"grad_norm": 2.5002622810382085,
"learning_rate": 1.2621702523705676e-06,
"loss": 0.0642,
"step": 1044
},
{
"epoch": 1.5390279823269513,
"grad_norm": 2.0435391117373976,
"learning_rate": 1.2544976145809018e-06,
"loss": 0.0595,
"step": 1045
},
{
"epoch": 1.540500736377025,
"grad_norm": 2.8188490521434906,
"learning_rate": 1.2468450219772054e-06,
"loss": 0.0782,
"step": 1046
},
{
"epoch": 1.5419734904270985,
"grad_norm": 1.9740548073880617,
"learning_rate": 1.2392125155146385e-06,
"loss": 0.0552,
"step": 1047
},
{
"epoch": 1.5434462444771722,
"grad_norm": 2.6523564783225924,
"learning_rate": 1.2316001360408614e-06,
"loss": 0.0723,
"step": 1048
},
{
"epoch": 1.5449189985272458,
"grad_norm": 2.568676844998587,
"learning_rate": 1.224007924295819e-06,
"loss": 0.0788,
"step": 1049
},
{
"epoch": 1.5463917525773194,
"grad_norm": 2.2720349996304616,
"learning_rate": 1.2164359209115235e-06,
"loss": 0.0491,
"step": 1050
},
{
"epoch": 1.5478645066273933,
"grad_norm": 2.620055235482202,
"learning_rate": 1.2088841664118367e-06,
"loss": 0.1146,
"step": 1051
},
{
"epoch": 1.549337260677467,
"grad_norm": 2.6188369554058464,
"learning_rate": 1.2013527012122477e-06,
"loss": 0.0867,
"step": 1052
},
{
"epoch": 1.5508100147275405,
"grad_norm": 3.2625473467121715,
"learning_rate": 1.1938415656196673e-06,
"loss": 0.0802,
"step": 1053
},
{
"epoch": 1.5522827687776142,
"grad_norm": 2.1439253167381938,
"learning_rate": 1.1863507998322022e-06,
"loss": 0.0593,
"step": 1054
},
{
"epoch": 1.5537555228276878,
"grad_norm": 2.7156469251806756,
"learning_rate": 1.1788804439389457e-06,
"loss": 0.0782,
"step": 1055
},
{
"epoch": 1.5552282768777614,
"grad_norm": 2.6537677951105683,
"learning_rate": 1.1714305379197616e-06,
"loss": 0.0688,
"step": 1056
},
{
"epoch": 1.556701030927835,
"grad_norm": 2.5608627638328536,
"learning_rate": 1.164001121645069e-06,
"loss": 0.0938,
"step": 1057
},
{
"epoch": 1.5581737849779087,
"grad_norm": 2.926057413992728,
"learning_rate": 1.1565922348756324e-06,
"loss": 0.0855,
"step": 1058
},
{
"epoch": 1.5596465390279823,
"grad_norm": 1.9123934797486226,
"learning_rate": 1.149203917262341e-06,
"loss": 0.0407,
"step": 1059
},
{
"epoch": 1.561119293078056,
"grad_norm": 3.1997095246827523,
"learning_rate": 1.1418362083460067e-06,
"loss": 0.0745,
"step": 1060
},
{
"epoch": 1.5625920471281296,
"grad_norm": 2.872152786504894,
"learning_rate": 1.1344891475571474e-06,
"loss": 0.0725,
"step": 1061
},
{
"epoch": 1.5640648011782032,
"grad_norm": 2.505626890725191,
"learning_rate": 1.1271627742157743e-06,
"loss": 0.0864,
"step": 1062
},
{
"epoch": 1.5655375552282769,
"grad_norm": 2.635180660318519,
"learning_rate": 1.1198571275311859e-06,
"loss": 0.0806,
"step": 1063
},
{
"epoch": 1.5670103092783505,
"grad_norm": 2.2183669641526853,
"learning_rate": 1.1125722466017547e-06,
"loss": 0.0548,
"step": 1064
},
{
"epoch": 1.5684830633284241,
"grad_norm": 3.0342759269799955,
"learning_rate": 1.1053081704147162e-06,
"loss": 0.0725,
"step": 1065
},
{
"epoch": 1.5699558173784978,
"grad_norm": 2.5683330192524165,
"learning_rate": 1.0980649378459668e-06,
"loss": 0.0655,
"step": 1066
},
{
"epoch": 1.5714285714285714,
"grad_norm": 2.6764766220734577,
"learning_rate": 1.0908425876598512e-06,
"loss": 0.0399,
"step": 1067
},
{
"epoch": 1.572901325478645,
"grad_norm": 2.0877629068062373,
"learning_rate": 1.083641158508955e-06,
"loss": 0.0609,
"step": 1068
},
{
"epoch": 1.5743740795287187,
"grad_norm": 2.7312069426846075,
"learning_rate": 1.0764606889338997e-06,
"loss": 0.073,
"step": 1069
},
{
"epoch": 1.5758468335787923,
"grad_norm": 2.6538924576288005,
"learning_rate": 1.0693012173631346e-06,
"loss": 0.0571,
"step": 1070
},
{
"epoch": 1.577319587628866,
"grad_norm": 2.3418650892508803,
"learning_rate": 1.062162782112729e-06,
"loss": 0.0635,
"step": 1071
},
{
"epoch": 1.5787923416789398,
"grad_norm": 2.9870001943313262,
"learning_rate": 1.055045421386175e-06,
"loss": 0.0713,
"step": 1072
},
{
"epoch": 1.5802650957290134,
"grad_norm": 2.9325987045082753,
"learning_rate": 1.0479491732741747e-06,
"loss": 0.0553,
"step": 1073
},
{
"epoch": 1.581737849779087,
"grad_norm": 2.3181702748888973,
"learning_rate": 1.0408740757544416e-06,
"loss": 0.0586,
"step": 1074
},
{
"epoch": 1.5832106038291607,
"grad_norm": 2.5995727377845337,
"learning_rate": 1.0338201666914942e-06,
"loss": 0.057,
"step": 1075
},
{
"epoch": 1.5846833578792343,
"grad_norm": 2.34412342665819,
"learning_rate": 1.0267874838364561e-06,
"loss": 0.068,
"step": 1076
},
{
"epoch": 1.586156111929308,
"grad_norm": 2.761988559212666,
"learning_rate": 1.0197760648268485e-06,
"loss": 0.0581,
"step": 1077
},
{
"epoch": 1.5876288659793816,
"grad_norm": 3.2149969036078456,
"learning_rate": 1.012785947186397e-06,
"loss": 0.0795,
"step": 1078
},
{
"epoch": 1.5891016200294552,
"grad_norm": 2.34613943148429,
"learning_rate": 1.0058171683248246e-06,
"loss": 0.058,
"step": 1079
},
{
"epoch": 1.5905743740795288,
"grad_norm": 2.065050664858047,
"learning_rate": 9.988697655376544e-07,
"loss": 0.0592,
"step": 1080
},
{
"epoch": 1.5920471281296025,
"grad_norm": 2.735250740838054,
"learning_rate": 9.919437760060075e-07,
"loss": 0.0857,
"step": 1081
},
{
"epoch": 1.593519882179676,
"grad_norm": 1.811552401317801,
"learning_rate": 9.850392367964085e-07,
"loss": 0.0519,
"step": 1082
},
{
"epoch": 1.5949926362297497,
"grad_norm": 2.562566270590748,
"learning_rate": 9.781561848605775e-07,
"loss": 0.0602,
"step": 1083
},
{
"epoch": 1.5964653902798234,
"grad_norm": 1.8016132067357058,
"learning_rate": 9.71294657035247e-07,
"loss": 0.0533,
"step": 1084
},
{
"epoch": 1.597938144329897,
"grad_norm": 2.390912745981556,
"learning_rate": 9.644546900419533e-07,
"loss": 0.0543,
"step": 1085
},
{
"epoch": 1.5994108983799706,
"grad_norm": 2.7902763091008422,
"learning_rate": 9.576363204868417e-07,
"loss": 0.0937,
"step": 1086
},
{
"epoch": 1.6008836524300443,
"grad_norm": 2.2988343421638042,
"learning_rate": 9.508395848604757e-07,
"loss": 0.0524,
"step": 1087
},
{
"epoch": 1.602356406480118,
"grad_norm": 2.4934736154800774,
"learning_rate": 9.440645195376341e-07,
"loss": 0.0491,
"step": 1088
},
{
"epoch": 1.6038291605301915,
"grad_norm": 2.6541495173985954,
"learning_rate": 9.373111607771241e-07,
"loss": 0.0981,
"step": 1089
},
{
"epoch": 1.6053019145802652,
"grad_norm": 2.633404350553692,
"learning_rate": 9.305795447215827e-07,
"loss": 0.0612,
"step": 1090
},
{
"epoch": 1.6067746686303388,
"grad_norm": 3.7648499982526973,
"learning_rate": 9.23869707397283e-07,
"loss": 0.1132,
"step": 1091
},
{
"epoch": 1.6082474226804124,
"grad_norm": 1.9284626831978018,
"learning_rate": 9.171816847139447e-07,
"loss": 0.0398,
"step": 1092
},
{
"epoch": 1.609720176730486,
"grad_norm": 2.0042841970888583,
"learning_rate": 9.105155124645403e-07,
"loss": 0.0408,
"step": 1093
},
{
"epoch": 1.6111929307805597,
"grad_norm": 1.9421306323683327,
"learning_rate": 9.038712263250982e-07,
"loss": 0.0538,
"step": 1094
},
{
"epoch": 1.6126656848306333,
"grad_norm": 2.377851383188798,
"learning_rate": 8.972488618545222e-07,
"loss": 0.0679,
"step": 1095
},
{
"epoch": 1.614138438880707,
"grad_norm": 2.3273836911909354,
"learning_rate": 8.906484544943933e-07,
"loss": 0.0723,
"step": 1096
},
{
"epoch": 1.6156111929307806,
"grad_norm": 1.9062923841071093,
"learning_rate": 8.840700395687824e-07,
"loss": 0.0561,
"step": 1097
},
{
"epoch": 1.6170839469808542,
"grad_norm": 2.973212015922115,
"learning_rate": 8.775136522840622e-07,
"loss": 0.075,
"step": 1098
},
{
"epoch": 1.6185567010309279,
"grad_norm": 2.2735420808341327,
"learning_rate": 8.709793277287182e-07,
"loss": 0.0801,
"step": 1099
},
{
"epoch": 1.6200294550810015,
"grad_norm": 2.388145827824408,
"learning_rate": 8.64467100873157e-07,
"loss": 0.0623,
"step": 1100
},
{
"epoch": 1.6215022091310751,
"grad_norm": 2.2114732024014114,
"learning_rate": 8.579770065695264e-07,
"loss": 0.0723,
"step": 1101
},
{
"epoch": 1.6229749631811488,
"grad_norm": 2.067832894100951,
"learning_rate": 8.515090795515247e-07,
"loss": 0.0563,
"step": 1102
},
{
"epoch": 1.6244477172312224,
"grad_norm": 2.464224803989418,
"learning_rate": 8.450633544342135e-07,
"loss": 0.0916,
"step": 1103
},
{
"epoch": 1.625920471281296,
"grad_norm": 2.868289318235728,
"learning_rate": 8.386398657138356e-07,
"loss": 0.0595,
"step": 1104
},
{
"epoch": 1.6273932253313697,
"grad_norm": 2.4491602380510047,
"learning_rate": 8.322386477676309e-07,
"loss": 0.0615,
"step": 1105
},
{
"epoch": 1.6288659793814433,
"grad_norm": 1.9912741641293554,
"learning_rate": 8.258597348536452e-07,
"loss": 0.0542,
"step": 1106
},
{
"epoch": 1.630338733431517,
"grad_norm": 2.6689461386284714,
"learning_rate": 8.195031611105564e-07,
"loss": 0.0832,
"step": 1107
},
{
"epoch": 1.6318114874815906,
"grad_norm": 2.4194950903750367,
"learning_rate": 8.131689605574867e-07,
"loss": 0.0538,
"step": 1108
},
{
"epoch": 1.6332842415316642,
"grad_norm": 3.477850491611638,
"learning_rate": 8.068571670938219e-07,
"loss": 0.0912,
"step": 1109
},
{
"epoch": 1.6347569955817378,
"grad_norm": 2.864529604206013,
"learning_rate": 8.005678144990281e-07,
"loss": 0.065,
"step": 1110
},
{
"epoch": 1.6362297496318114,
"grad_norm": 2.9727127439474086,
"learning_rate": 7.943009364324733e-07,
"loss": 0.0766,
"step": 1111
},
{
"epoch": 1.637702503681885,
"grad_norm": 2.788985489850606,
"learning_rate": 7.880565664332473e-07,
"loss": 0.0754,
"step": 1112
},
{
"epoch": 1.6391752577319587,
"grad_norm": 1.9204008432196964,
"learning_rate": 7.818347379199781e-07,
"loss": 0.0459,
"step": 1113
},
{
"epoch": 1.6406480117820323,
"grad_norm": 2.638641405268061,
"learning_rate": 7.756354841906582e-07,
"loss": 0.0621,
"step": 1114
},
{
"epoch": 1.642120765832106,
"grad_norm": 2.373647501644597,
"learning_rate": 7.694588384224655e-07,
"loss": 0.0613,
"step": 1115
},
{
"epoch": 1.6435935198821796,
"grad_norm": 2.464060690229074,
"learning_rate": 7.633048336715815e-07,
"loss": 0.0643,
"step": 1116
},
{
"epoch": 1.6450662739322532,
"grad_norm": 2.173724799716864,
"learning_rate": 7.571735028730209e-07,
"loss": 0.0539,
"step": 1117
},
{
"epoch": 1.6465390279823269,
"grad_norm": 2.549627425368719,
"learning_rate": 7.510648788404501e-07,
"loss": 0.0751,
"step": 1118
},
{
"epoch": 1.6480117820324005,
"grad_norm": 2.0145805351899337,
"learning_rate": 7.449789942660119e-07,
"loss": 0.0514,
"step": 1119
},
{
"epoch": 1.6494845360824741,
"grad_norm": 2.0685293893328485,
"learning_rate": 7.389158817201541e-07,
"loss": 0.0411,
"step": 1120
},
{
"epoch": 1.6509572901325478,
"grad_norm": 2.4246510587478682,
"learning_rate": 7.328755736514537e-07,
"loss": 0.0515,
"step": 1121
},
{
"epoch": 1.6524300441826214,
"grad_norm": 2.3443764090658523,
"learning_rate": 7.268581023864402e-07,
"loss": 0.0594,
"step": 1122
},
{
"epoch": 1.653902798232695,
"grad_norm": 2.4737045558016426,
"learning_rate": 7.208635001294278e-07,
"loss": 0.055,
"step": 1123
},
{
"epoch": 1.6553755522827687,
"grad_norm": 2.510219341123867,
"learning_rate": 7.148917989623388e-07,
"loss": 0.0619,
"step": 1124
},
{
"epoch": 1.6568483063328423,
"grad_norm": 2.294118510364604,
"learning_rate": 7.089430308445317e-07,
"loss": 0.0561,
"step": 1125
},
{
"epoch": 1.658321060382916,
"grad_norm": 2.37841481598977,
"learning_rate": 7.030172276126351e-07,
"loss": 0.0741,
"step": 1126
},
{
"epoch": 1.6597938144329896,
"grad_norm": 2.4737300923136325,
"learning_rate": 6.971144209803738e-07,
"loss": 0.0713,
"step": 1127
},
{
"epoch": 1.6612665684830632,
"grad_norm": 2.4437627355321045,
"learning_rate": 6.912346425383964e-07,
"loss": 0.0654,
"step": 1128
},
{
"epoch": 1.6627393225331368,
"grad_norm": 2.2165199303595564,
"learning_rate": 6.85377923754113e-07,
"loss": 0.0653,
"step": 1129
},
{
"epoch": 1.6642120765832105,
"grad_norm": 2.2014815708680895,
"learning_rate": 6.795442959715192e-07,
"loss": 0.056,
"step": 1130
},
{
"epoch": 1.665684830633284,
"grad_norm": 2.698668279501925,
"learning_rate": 6.737337904110341e-07,
"loss": 0.0508,
"step": 1131
},
{
"epoch": 1.6671575846833577,
"grad_norm": 2.454195130183351,
"learning_rate": 6.679464381693324e-07,
"loss": 0.0557,
"step": 1132
},
{
"epoch": 1.6686303387334314,
"grad_norm": 2.423192522005641,
"learning_rate": 6.621822702191744e-07,
"loss": 0.0678,
"step": 1133
},
{
"epoch": 1.670103092783505,
"grad_norm": 2.0690565021208482,
"learning_rate": 6.564413174092443e-07,
"loss": 0.0433,
"step": 1134
},
{
"epoch": 1.6715758468335786,
"grad_norm": 2.2724584676649866,
"learning_rate": 6.507236104639842e-07,
"loss": 0.0518,
"step": 1135
},
{
"epoch": 1.6730486008836525,
"grad_norm": 2.568231491867872,
"learning_rate": 6.450291799834257e-07,
"loss": 0.0712,
"step": 1136
},
{
"epoch": 1.6745213549337261,
"grad_norm": 2.3899069257751555,
"learning_rate": 6.393580564430319e-07,
"loss": 0.0521,
"step": 1137
},
{
"epoch": 1.6759941089837997,
"grad_norm": 2.7169102576317905,
"learning_rate": 6.337102701935322e-07,
"loss": 0.0796,
"step": 1138
},
{
"epoch": 1.6774668630338734,
"grad_norm": 2.1590618186403026,
"learning_rate": 6.28085851460758e-07,
"loss": 0.0588,
"step": 1139
},
{
"epoch": 1.678939617083947,
"grad_norm": 2.1974111488329946,
"learning_rate": 6.224848303454828e-07,
"loss": 0.0775,
"step": 1140
},
{
"epoch": 1.6804123711340206,
"grad_norm": 2.4856649528523955,
"learning_rate": 6.16907236823262e-07,
"loss": 0.0653,
"step": 1141
},
{
"epoch": 1.6818851251840943,
"grad_norm": 2.4487555816337236,
"learning_rate": 6.11353100744268e-07,
"loss": 0.0896,
"step": 1142
},
{
"epoch": 1.683357879234168,
"grad_norm": 2.318930173137003,
"learning_rate": 6.058224518331374e-07,
"loss": 0.0779,
"step": 1143
},
{
"epoch": 1.6848306332842415,
"grad_norm": 2.4160212230081566,
"learning_rate": 6.003153196888045e-07,
"loss": 0.0515,
"step": 1144
},
{
"epoch": 1.6863033873343152,
"grad_norm": 2.4777587223820334,
"learning_rate": 5.948317337843501e-07,
"loss": 0.0686,
"step": 1145
},
{
"epoch": 1.6877761413843888,
"grad_norm": 2.6254920597873945,
"learning_rate": 5.893717234668383e-07,
"loss": 0.0876,
"step": 1146
},
{
"epoch": 1.6892488954344624,
"grad_norm": 2.4925800987500937,
"learning_rate": 5.839353179571617e-07,
"loss": 0.0753,
"step": 1147
},
{
"epoch": 1.690721649484536,
"grad_norm": 2.5508449281167724,
"learning_rate": 5.785225463498828e-07,
"loss": 0.0603,
"step": 1148
},
{
"epoch": 1.6921944035346097,
"grad_norm": 1.805650361123776,
"learning_rate": 5.731334376130826e-07,
"loss": 0.0419,
"step": 1149
},
{
"epoch": 1.6936671575846833,
"grad_norm": 2.4585168217267785,
"learning_rate": 5.67768020588203e-07,
"loss": 0.0576,
"step": 1150
},
{
"epoch": 1.695139911634757,
"grad_norm": 3.7525272205812903,
"learning_rate": 5.624263239898909e-07,
"loss": 0.098,
"step": 1151
},
{
"epoch": 1.6966126656848306,
"grad_norm": 2.3382149888369628,
"learning_rate": 5.571083764058482e-07,
"loss": 0.0688,
"step": 1152
},
{
"epoch": 1.6980854197349042,
"grad_norm": 2.472002833620757,
"learning_rate": 5.518142062966759e-07,
"loss": 0.0607,
"step": 1153
},
{
"epoch": 1.6995581737849779,
"grad_norm": 2.6624553849748076,
"learning_rate": 5.465438419957209e-07,
"loss": 0.0868,
"step": 1154
},
{
"epoch": 1.7010309278350515,
"grad_norm": 2.330089465817626,
"learning_rate": 5.412973117089288e-07,
"loss": 0.0703,
"step": 1155
},
{
"epoch": 1.7025036818851251,
"grad_norm": 2.487074034850233,
"learning_rate": 5.360746435146885e-07,
"loss": 0.0628,
"step": 1156
},
{
"epoch": 1.703976435935199,
"grad_norm": 2.0361552942415306,
"learning_rate": 5.308758653636853e-07,
"loss": 0.0414,
"step": 1157
},
{
"epoch": 1.7054491899852726,
"grad_norm": 3.1094877949682234,
"learning_rate": 5.257010050787487e-07,
"loss": 0.0686,
"step": 1158
},
{
"epoch": 1.7069219440353463,
"grad_norm": 2.4861661405757474,
"learning_rate": 5.20550090354705e-07,
"loss": 0.0504,
"step": 1159
},
{
"epoch": 1.7083946980854199,
"grad_norm": 2.528562665713088,
"learning_rate": 5.154231487582273e-07,
"loss": 0.0655,
"step": 1160
},
{
"epoch": 1.7098674521354935,
"grad_norm": 2.4106846342395967,
"learning_rate": 5.103202077276908e-07,
"loss": 0.0726,
"step": 1161
},
{
"epoch": 1.7113402061855671,
"grad_norm": 2.0306440917007156,
"learning_rate": 5.05241294573024e-07,
"loss": 0.0596,
"step": 1162
},
{
"epoch": 1.7128129602356408,
"grad_norm": 2.2743774239571604,
"learning_rate": 5.001864364755626e-07,
"loss": 0.0645,
"step": 1163
},
{
"epoch": 1.7142857142857144,
"grad_norm": 2.5250273423806444,
"learning_rate": 4.951556604879049e-07,
"loss": 0.0874,
"step": 1164
},
{
"epoch": 1.715758468335788,
"grad_norm": 2.1645897297660928,
"learning_rate": 4.901489935337661e-07,
"loss": 0.0533,
"step": 1165
},
{
"epoch": 1.7172312223858617,
"grad_norm": 3.2693852161381383,
"learning_rate": 4.851664624078356e-07,
"loss": 0.0956,
"step": 1166
},
{
"epoch": 1.7187039764359353,
"grad_norm": 2.0954735704775116,
"learning_rate": 4.802080937756304e-07,
"loss": 0.0408,
"step": 1167
},
{
"epoch": 1.720176730486009,
"grad_norm": 2.4457829307784307,
"learning_rate": 4.7527391417335647e-07,
"loss": 0.0534,
"step": 1168
},
{
"epoch": 1.7216494845360826,
"grad_norm": 2.2487291126239612,
"learning_rate": 4.7036395000776556e-07,
"loss": 0.0691,
"step": 1169
},
{
"epoch": 1.7231222385861562,
"grad_norm": 3.295076111621927,
"learning_rate": 4.654782275560127e-07,
"loss": 0.0747,
"step": 1170
},
{
"epoch": 1.7245949926362298,
"grad_norm": 1.6861287640422942,
"learning_rate": 4.606167729655131e-07,
"loss": 0.0427,
"step": 1171
},
{
"epoch": 1.7260677466863035,
"grad_norm": 1.7666774627400288,
"learning_rate": 4.5577961225380886e-07,
"loss": 0.0512,
"step": 1172
},
{
"epoch": 1.727540500736377,
"grad_norm": 2.787550564417676,
"learning_rate": 4.5096677130842535e-07,
"loss": 0.0511,
"step": 1173
},
{
"epoch": 1.7290132547864507,
"grad_norm": 2.8606514719301486,
"learning_rate": 4.4617827588673167e-07,
"loss": 0.0617,
"step": 1174
},
{
"epoch": 1.7304860088365244,
"grad_norm": 3.0176679136529696,
"learning_rate": 4.4141415161580714e-07,
"loss": 0.0627,
"step": 1175
},
{
"epoch": 1.731958762886598,
"grad_norm": 2.147973320636422,
"learning_rate": 4.3667442399229985e-07,
"loss": 0.0577,
"step": 1176
},
{
"epoch": 1.7334315169366716,
"grad_norm": 2.4334937107986874,
"learning_rate": 4.319591183822902e-07,
"loss": 0.0654,
"step": 1177
},
{
"epoch": 1.7349042709867453,
"grad_norm": 2.1068870849677186,
"learning_rate": 4.272682600211608e-07,
"loss": 0.0641,
"step": 1178
},
{
"epoch": 1.736377025036819,
"grad_norm": 2.3360669303149795,
"learning_rate": 4.226018740134541e-07,
"loss": 0.0425,
"step": 1179
},
{
"epoch": 1.7378497790868925,
"grad_norm": 2.6420334626364,
"learning_rate": 4.179599853327426e-07,
"loss": 0.049,
"step": 1180
},
{
"epoch": 1.7393225331369662,
"grad_norm": 3.1620216422362097,
"learning_rate": 4.1334261882149505e-07,
"loss": 0.085,
"step": 1181
},
{
"epoch": 1.7407952871870398,
"grad_norm": 1.9512418086161625,
"learning_rate": 4.0874979919094004e-07,
"loss": 0.0471,
"step": 1182
},
{
"epoch": 1.7422680412371134,
"grad_norm": 1.9156501885886787,
"learning_rate": 4.041815510209396e-07,
"loss": 0.0449,
"step": 1183
},
{
"epoch": 1.743740795287187,
"grad_norm": 2.5213922767677106,
"learning_rate": 3.996378987598487e-07,
"loss": 0.0747,
"step": 1184
},
{
"epoch": 1.7452135493372607,
"grad_norm": 2.329334413664428,
"learning_rate": 3.9511886672439546e-07,
"loss": 0.0409,
"step": 1185
},
{
"epoch": 1.7466863033873343,
"grad_norm": 2.0605800429804786,
"learning_rate": 3.906244790995423e-07,
"loss": 0.0527,
"step": 1186
},
{
"epoch": 1.748159057437408,
"grad_norm": 2.0395950012587583,
"learning_rate": 3.8615475993836026e-07,
"loss": 0.068,
"step": 1187
},
{
"epoch": 1.7496318114874816,
"grad_norm": 2.4728396294735604,
"learning_rate": 3.8170973316190074e-07,
"loss": 0.065,
"step": 1188
},
{
"epoch": 1.7511045655375552,
"grad_norm": 2.5630343389473014,
"learning_rate": 3.7728942255906565e-07,
"loss": 0.0783,
"step": 1189
},
{
"epoch": 1.7525773195876289,
"grad_norm": 2.4356317818958217,
"learning_rate": 3.728938517864794e-07,
"loss": 0.0552,
"step": 1190
},
{
"epoch": 1.7540500736377025,
"grad_norm": 2.439624630915906,
"learning_rate": 3.6852304436836573e-07,
"loss": 0.0768,
"step": 1191
},
{
"epoch": 1.7555228276877761,
"grad_norm": 2.254303368671749,
"learning_rate": 3.641770236964193e-07,
"loss": 0.0906,
"step": 1192
},
{
"epoch": 1.7569955817378498,
"grad_norm": 2.617239568329721,
"learning_rate": 3.598558130296809e-07,
"loss": 0.072,
"step": 1193
},
{
"epoch": 1.7584683357879234,
"grad_norm": 2.340193575135741,
"learning_rate": 3.555594354944125e-07,
"loss": 0.0565,
"step": 1194
},
{
"epoch": 1.759941089837997,
"grad_norm": 1.9253376206694792,
"learning_rate": 3.5128791408397646e-07,
"loss": 0.0398,
"step": 1195
},
{
"epoch": 1.7614138438880707,
"grad_norm": 2.155096832471741,
"learning_rate": 3.4704127165870514e-07,
"loss": 0.0566,
"step": 1196
},
{
"epoch": 1.7628865979381443,
"grad_norm": 2.5829208788530145,
"learning_rate": 3.4281953094578877e-07,
"loss": 0.0608,
"step": 1197
},
{
"epoch": 1.764359351988218,
"grad_norm": 2.849311300023252,
"learning_rate": 3.386227145391463e-07,
"loss": 0.0819,
"step": 1198
},
{
"epoch": 1.7658321060382915,
"grad_norm": 2.8626031957772544,
"learning_rate": 3.3445084489930613e-07,
"loss": 0.076,
"step": 1199
},
{
"epoch": 1.7673048600883652,
"grad_norm": 2.1516838804074943,
"learning_rate": 3.303039443532874e-07,
"loss": 0.0509,
"step": 1200
},
{
"epoch": 1.7687776141384388,
"grad_norm": 2.4872295275914786,
"learning_rate": 3.2618203509448054e-07,
"loss": 0.0632,
"step": 1201
},
{
"epoch": 1.7702503681885124,
"grad_norm": 2.3779433828022194,
"learning_rate": 3.220851391825247e-07,
"loss": 0.0639,
"step": 1202
},
{
"epoch": 1.771723122238586,
"grad_norm": 2.133404720057926,
"learning_rate": 3.18013278543195e-07,
"loss": 0.0449,
"step": 1203
},
{
"epoch": 1.7731958762886597,
"grad_norm": 2.1635728674893837,
"learning_rate": 3.1396647496828245e-07,
"loss": 0.0566,
"step": 1204
},
{
"epoch": 1.7746686303387333,
"grad_norm": 1.8474795208949297,
"learning_rate": 3.0994475011547675e-07,
"loss": 0.0538,
"step": 1205
},
{
"epoch": 1.776141384388807,
"grad_norm": 2.6491356589687975,
"learning_rate": 3.0594812550825194e-07,
"loss": 0.0806,
"step": 1206
},
{
"epoch": 1.7776141384388806,
"grad_norm": 1.9201660665459854,
"learning_rate": 3.0197662253575123e-07,
"loss": 0.0462,
"step": 1207
},
{
"epoch": 1.7790868924889542,
"grad_norm": 2.456247251421313,
"learning_rate": 2.980302624526693e-07,
"loss": 0.0682,
"step": 1208
},
{
"epoch": 1.7805596465390279,
"grad_norm": 2.0836624878249435,
"learning_rate": 2.941090663791435e-07,
"loss": 0.0581,
"step": 1209
},
{
"epoch": 1.7820324005891015,
"grad_norm": 2.2164461597934504,
"learning_rate": 2.902130553006366e-07,
"loss": 0.0693,
"step": 1210
},
{
"epoch": 1.7835051546391751,
"grad_norm": 3.134611247374976,
"learning_rate": 2.8634225006782867e-07,
"loss": 0.0902,
"step": 1211
},
{
"epoch": 1.7849779086892488,
"grad_norm": 2.2452464988460465,
"learning_rate": 2.8249667139650215e-07,
"loss": 0.0542,
"step": 1212
},
{
"epoch": 1.7864506627393224,
"grad_norm": 2.733041617005635,
"learning_rate": 2.786763398674297e-07,
"loss": 0.0471,
"step": 1213
},
{
"epoch": 1.787923416789396,
"grad_norm": 2.7893962181248027,
"learning_rate": 2.748812759262687e-07,
"loss": 0.0832,
"step": 1214
},
{
"epoch": 1.7893961708394697,
"grad_norm": 1.9564030920988806,
"learning_rate": 2.711114998834485e-07,
"loss": 0.0552,
"step": 1215
},
{
"epoch": 1.7908689248895433,
"grad_norm": 2.8778504339141393,
"learning_rate": 2.6736703191406366e-07,
"loss": 0.0642,
"step": 1216
},
{
"epoch": 1.792341678939617,
"grad_norm": 2.5297695421363136,
"learning_rate": 2.636478920577634e-07,
"loss": 0.0529,
"step": 1217
},
{
"epoch": 1.7938144329896906,
"grad_norm": 2.3107730882334594,
"learning_rate": 2.599541002186479e-07,
"loss": 0.06,
"step": 1218
},
{
"epoch": 1.7952871870397642,
"grad_norm": 2.5790224501462373,
"learning_rate": 2.5628567616515667e-07,
"loss": 0.0666,
"step": 1219
},
{
"epoch": 1.7967599410898378,
"grad_norm": 2.6469259537969765,
"learning_rate": 2.5264263952996915e-07,
"loss": 0.074,
"step": 1220
},
{
"epoch": 1.7982326951399117,
"grad_norm": 2.9407112954766643,
"learning_rate": 2.49025009809894e-07,
"loss": 0.0851,
"step": 1221
},
{
"epoch": 1.7997054491899853,
"grad_norm": 2.3658889493345203,
"learning_rate": 2.4543280636576795e-07,
"loss": 0.0769,
"step": 1222
},
{
"epoch": 1.801178203240059,
"grad_norm": 2.3869878273402567,
"learning_rate": 2.4186604842235285e-07,
"loss": 0.0632,
"step": 1223
},
{
"epoch": 1.8026509572901326,
"grad_norm": 2.5921093452272777,
"learning_rate": 2.3832475506822937e-07,
"loss": 0.0607,
"step": 1224
},
{
"epoch": 1.8041237113402062,
"grad_norm": 2.460066877289172,
"learning_rate": 2.3480894525569564e-07,
"loss": 0.0616,
"step": 1225
},
{
"epoch": 1.8055964653902798,
"grad_norm": 2.2895807092767524,
"learning_rate": 2.3131863780067043e-07,
"loss": 0.0469,
"step": 1226
},
{
"epoch": 1.8070692194403535,
"grad_norm": 2.21813812283152,
"learning_rate": 2.278538513825862e-07,
"loss": 0.0581,
"step": 1227
},
{
"epoch": 1.8085419734904271,
"grad_norm": 2.40243611426377,
"learning_rate": 2.2441460454429298e-07,
"loss": 0.0554,
"step": 1228
},
{
"epoch": 1.8100147275405007,
"grad_norm": 3.508821815855263,
"learning_rate": 2.2100091569195735e-07,
"loss": 0.0617,
"step": 1229
},
{
"epoch": 1.8114874815905744,
"grad_norm": 2.5524630260490344,
"learning_rate": 2.1761280309496645e-07,
"loss": 0.0523,
"step": 1230
},
{
"epoch": 1.812960235640648,
"grad_norm": 2.8113266089557647,
"learning_rate": 2.1425028488582523e-07,
"loss": 0.0699,
"step": 1231
},
{
"epoch": 1.8144329896907216,
"grad_norm": 2.6452328788944666,
"learning_rate": 2.109133790600648e-07,
"loss": 0.065,
"step": 1232
},
{
"epoch": 1.8159057437407953,
"grad_norm": 2.8961481755343708,
"learning_rate": 2.0760210347614383e-07,
"loss": 0.0672,
"step": 1233
},
{
"epoch": 1.817378497790869,
"grad_norm": 2.477715909813392,
"learning_rate": 2.043164758553523e-07,
"loss": 0.0659,
"step": 1234
},
{
"epoch": 1.8188512518409425,
"grad_norm": 2.2219672216441784,
"learning_rate": 2.010565137817172e-07,
"loss": 0.0642,
"step": 1235
},
{
"epoch": 1.8203240058910162,
"grad_norm": 2.807219268334985,
"learning_rate": 1.9782223470191043e-07,
"loss": 0.0887,
"step": 1236
},
{
"epoch": 1.8217967599410898,
"grad_norm": 1.7331081934051469,
"learning_rate": 1.9461365592515103e-07,
"loss": 0.0462,
"step": 1237
},
{
"epoch": 1.8232695139911634,
"grad_norm": 2.487498627433016,
"learning_rate": 1.9143079462311644e-07,
"loss": 0.0553,
"step": 1238
},
{
"epoch": 1.824742268041237,
"grad_norm": 2.395541301875943,
"learning_rate": 1.8827366782984913e-07,
"loss": 0.064,
"step": 1239
},
{
"epoch": 1.8262150220913107,
"grad_norm": 2.4821082624397315,
"learning_rate": 1.851422924416657e-07,
"loss": 0.0588,
"step": 1240
},
{
"epoch": 1.8276877761413843,
"grad_norm": 1.7953626408649892,
"learning_rate": 1.820366852170663e-07,
"loss": 0.0572,
"step": 1241
},
{
"epoch": 1.8291605301914582,
"grad_norm": 2.263788364815126,
"learning_rate": 1.789568627766447e-07,
"loss": 0.0564,
"step": 1242
},
{
"epoch": 1.8306332842415318,
"grad_norm": 2.548871383114113,
"learning_rate": 1.7590284160300065e-07,
"loss": 0.0885,
"step": 1243
},
{
"epoch": 1.8321060382916055,
"grad_norm": 1.9553559535921305,
"learning_rate": 1.7287463804064874e-07,
"loss": 0.0515,
"step": 1244
},
{
"epoch": 1.833578792341679,
"grad_norm": 2.1413849397508455,
"learning_rate": 1.6987226829593417e-07,
"loss": 0.0525,
"step": 1245
},
{
"epoch": 1.8350515463917527,
"grad_norm": 2.4170731730516435,
"learning_rate": 1.6689574843694433e-07,
"loss": 0.0536,
"step": 1246
},
{
"epoch": 1.8365243004418264,
"grad_norm": 2.26376563536471,
"learning_rate": 1.6394509439342343e-07,
"loss": 0.064,
"step": 1247
},
{
"epoch": 1.8379970544919,
"grad_norm": 2.354566095562083,
"learning_rate": 1.6102032195668639e-07,
"loss": 0.054,
"step": 1248
},
{
"epoch": 1.8394698085419736,
"grad_norm": 2.200305194780377,
"learning_rate": 1.5812144677953667e-07,
"loss": 0.0597,
"step": 1249
},
{
"epoch": 1.8409425625920472,
"grad_norm": 2.8036671730837996,
"learning_rate": 1.5524848437617757e-07,
"loss": 0.0788,
"step": 1250
},
{
"epoch": 1.8424153166421209,
"grad_norm": 2.9640155658781513,
"learning_rate": 1.5240145012213438e-07,
"loss": 0.0712,
"step": 1251
},
{
"epoch": 1.8438880706921945,
"grad_norm": 2.676706527145378,
"learning_rate": 1.4958035925417002e-07,
"loss": 0.1007,
"step": 1252
},
{
"epoch": 1.8453608247422681,
"grad_norm": 2.5173421640989733,
"learning_rate": 1.4678522687020414e-07,
"loss": 0.0642,
"step": 1253
},
{
"epoch": 1.8468335787923418,
"grad_norm": 2.4792807105107726,
"learning_rate": 1.4401606792923018e-07,
"loss": 0.0733,
"step": 1254
},
{
"epoch": 1.8483063328424154,
"grad_norm": 2.590202533627658,
"learning_rate": 1.4127289725123783e-07,
"loss": 0.0637,
"step": 1255
},
{
"epoch": 1.849779086892489,
"grad_norm": 2.4701164236940247,
"learning_rate": 1.3855572951713247e-07,
"loss": 0.0549,
"step": 1256
},
{
"epoch": 1.8512518409425627,
"grad_norm": 2.667366269263811,
"learning_rate": 1.3586457926865692e-07,
"loss": 0.0563,
"step": 1257
},
{
"epoch": 1.8527245949926363,
"grad_norm": 2.534826082762876,
"learning_rate": 1.3319946090831372e-07,
"loss": 0.0667,
"step": 1258
},
{
"epoch": 1.85419734904271,
"grad_norm": 3.184285823956059,
"learning_rate": 1.3056038869928732e-07,
"loss": 0.0708,
"step": 1259
},
{
"epoch": 1.8556701030927836,
"grad_norm": 2.6334408231116644,
"learning_rate": 1.2794737676536993e-07,
"loss": 0.0679,
"step": 1260
},
{
"epoch": 1.8571428571428572,
"grad_norm": 3.705317330764747,
"learning_rate": 1.253604390908819e-07,
"loss": 0.0684,
"step": 1261
},
{
"epoch": 1.8586156111929308,
"grad_norm": 2.6760578559720045,
"learning_rate": 1.2279958952060133e-07,
"loss": 0.061,
"step": 1262
},
{
"epoch": 1.8600883652430045,
"grad_norm": 2.683637739242373,
"learning_rate": 1.2026484175968744e-07,
"loss": 0.0697,
"step": 1263
},
{
"epoch": 1.861561119293078,
"grad_norm": 3.2978111974990347,
"learning_rate": 1.1775620937360677e-07,
"loss": 0.0753,
"step": 1264
},
{
"epoch": 1.8630338733431517,
"grad_norm": 2.4803177953803406,
"learning_rate": 1.1527370578806318e-07,
"loss": 0.0625,
"step": 1265
},
{
"epoch": 1.8645066273932254,
"grad_norm": 2.726915019647307,
"learning_rate": 1.128173442889241e-07,
"loss": 0.0755,
"step": 1266
},
{
"epoch": 1.865979381443299,
"grad_norm": 2.678703270186324,
"learning_rate": 1.1038713802214718e-07,
"loss": 0.0554,
"step": 1267
},
{
"epoch": 1.8674521354933726,
"grad_norm": 1.7752650933266636,
"learning_rate": 1.0798309999371537e-07,
"loss": 0.0522,
"step": 1268
},
{
"epoch": 1.8689248895434463,
"grad_norm": 2.6407719120165236,
"learning_rate": 1.0560524306956422e-07,
"loss": 0.0697,
"step": 1269
},
{
"epoch": 1.87039764359352,
"grad_norm": 2.9033590642091496,
"learning_rate": 1.0325357997551133e-07,
"loss": 0.0634,
"step": 1270
},
{
"epoch": 1.8718703976435935,
"grad_norm": 1.9492796659994043,
"learning_rate": 1.0092812329719149e-07,
"loss": 0.0489,
"step": 1271
},
{
"epoch": 1.8733431516936672,
"grad_norm": 2.4516366920256045,
"learning_rate": 9.862888547998828e-08,
"loss": 0.0591,
"step": 1272
},
{
"epoch": 1.8748159057437408,
"grad_norm": 2.464207142138413,
"learning_rate": 9.635587882896591e-08,
"loss": 0.0658,
"step": 1273
},
{
"epoch": 1.8762886597938144,
"grad_norm": 2.6681818756726314,
"learning_rate": 9.410911550880474e-08,
"loss": 0.0651,
"step": 1274
},
{
"epoch": 1.877761413843888,
"grad_norm": 2.95156310869132,
"learning_rate": 9.188860754373751e-08,
"loss": 0.0627,
"step": 1275
},
{
"epoch": 1.8792341678939617,
"grad_norm": 2.664762503107241,
"learning_rate": 8.969436681748211e-08,
"loss": 0.0712,
"step": 1276
},
{
"epoch": 1.8807069219440353,
"grad_norm": 2.309055318081491,
"learning_rate": 8.752640507317944e-08,
"loss": 0.0674,
"step": 1277
},
{
"epoch": 1.882179675994109,
"grad_norm": 3.571620633943013,
"learning_rate": 8.53847339133318e-08,
"loss": 0.0989,
"step": 1278
},
{
"epoch": 1.8836524300441826,
"grad_norm": 3.4554242703648077,
"learning_rate": 8.326936479973735e-08,
"loss": 0.1032,
"step": 1279
},
{
"epoch": 1.8851251840942562,
"grad_norm": 2.3511892419309226,
"learning_rate": 8.118030905343244e-08,
"loss": 0.0587,
"step": 1280
},
{
"epoch": 1.8865979381443299,
"grad_norm": 2.9875875390382176,
"learning_rate": 7.911757785462882e-08,
"loss": 0.0771,
"step": 1281
},
{
"epoch": 1.8880706921944035,
"grad_norm": 2.667124591832099,
"learning_rate": 7.708118224265538e-08,
"loss": 0.0726,
"step": 1282
},
{
"epoch": 1.8895434462444771,
"grad_norm": 2.808103542460803,
"learning_rate": 7.507113311589764e-08,
"loss": 0.0725,
"step": 1283
},
{
"epoch": 1.8910162002945508,
"grad_norm": 2.138310695092727,
"learning_rate": 7.308744123174006e-08,
"loss": 0.053,
"step": 1284
},
{
"epoch": 1.8924889543446244,
"grad_norm": 3.23592781169601,
"learning_rate": 7.113011720650709e-08,
"loss": 0.0861,
"step": 1285
},
{
"epoch": 1.893961708394698,
"grad_norm": 2.7938972462543603,
"learning_rate": 6.919917151540944e-08,
"loss": 0.0846,
"step": 1286
},
{
"epoch": 1.8954344624447717,
"grad_norm": 2.6676277265873445,
"learning_rate": 6.72946144924852e-08,
"loss": 0.0582,
"step": 1287
},
{
"epoch": 1.8969072164948453,
"grad_norm": 2.292320346160715,
"learning_rate": 6.54164563305465e-08,
"loss": 0.0683,
"step": 1288
},
{
"epoch": 1.898379970544919,
"grad_norm": 2.7181388928578416,
"learning_rate": 6.356470708112295e-08,
"loss": 0.0684,
"step": 1289
},
{
"epoch": 1.8998527245949925,
"grad_norm": 2.42255910784497,
"learning_rate": 6.173937665440943e-08,
"loss": 0.0863,
"step": 1290
},
{
"epoch": 1.9013254786450662,
"grad_norm": 2.0005300068986216,
"learning_rate": 5.994047481921283e-08,
"loss": 0.052,
"step": 1291
},
{
"epoch": 1.9027982326951398,
"grad_norm": 2.0006374023475018,
"learning_rate": 5.816801120289761e-08,
"loss": 0.0494,
"step": 1292
},
{
"epoch": 1.9042709867452134,
"grad_norm": 2.761206395395843,
"learning_rate": 5.642199529133918e-08,
"loss": 0.0711,
"step": 1293
},
{
"epoch": 1.905743740795287,
"grad_norm": 2.7407148804823604,
"learning_rate": 5.47024364288673e-08,
"loss": 0.0862,
"step": 1294
},
{
"epoch": 1.9072164948453607,
"grad_norm": 3.091222103037569,
"learning_rate": 5.3009343818219985e-08,
"loss": 0.0756,
"step": 1295
},
{
"epoch": 1.9086892488954343,
"grad_norm": 2.163303706955241,
"learning_rate": 5.13427265204941e-08,
"loss": 0.0478,
"step": 1296
},
{
"epoch": 1.910162002945508,
"grad_norm": 2.773239322890956,
"learning_rate": 4.970259345509376e-08,
"loss": 0.0972,
"step": 1297
},
{
"epoch": 1.9116347569955816,
"grad_norm": 2.608970295810049,
"learning_rate": 4.808895339968644e-08,
"loss": 0.0604,
"step": 1298
},
{
"epoch": 1.9131075110456552,
"grad_norm": 2.660546445460094,
"learning_rate": 4.650181499015416e-08,
"loss": 0.0582,
"step": 1299
},
{
"epoch": 1.9145802650957289,
"grad_norm": 2.7839755861009947,
"learning_rate": 4.4941186720546257e-08,
"loss": 0.0592,
"step": 1300
},
{
"epoch": 1.9160530191458025,
"grad_norm": 2.5610344626678065,
"learning_rate": 4.340707694303614e-08,
"loss": 0.0818,
"step": 1301
},
{
"epoch": 1.9175257731958761,
"grad_norm": 2.1786156029620045,
"learning_rate": 4.189949386787462e-08,
"loss": 0.0642,
"step": 1302
},
{
"epoch": 1.9189985272459498,
"grad_norm": 3.1670317662030696,
"learning_rate": 4.041844556334717e-08,
"loss": 0.0922,
"step": 1303
},
{
"epoch": 1.9204712812960234,
"grad_norm": 2.5277508443914942,
"learning_rate": 3.896393995573178e-08,
"loss": 0.052,
"step": 1304
},
{
"epoch": 1.9219440353460973,
"grad_norm": 2.4096715395360824,
"learning_rate": 3.75359848292528e-08,
"loss": 0.0472,
"step": 1305
},
{
"epoch": 1.923416789396171,
"grad_norm": 2.204721916769757,
"learning_rate": 3.613458782604329e-08,
"loss": 0.0543,
"step": 1306
},
{
"epoch": 1.9248895434462445,
"grad_norm": 2.5836879143448597,
"learning_rate": 3.475975644610219e-08,
"loss": 0.0758,
"step": 1307
},
{
"epoch": 1.9263622974963182,
"grad_norm": 2.457982163657938,
"learning_rate": 3.341149804725496e-08,
"loss": 0.0482,
"step": 1308
},
{
"epoch": 1.9278350515463918,
"grad_norm": 2.6231271942883345,
"learning_rate": 3.2089819845111946e-08,
"loss": 0.0758,
"step": 1309
},
{
"epoch": 1.9293078055964654,
"grad_norm": 2.5076609668403718,
"learning_rate": 3.079472891303337e-08,
"loss": 0.0641,
"step": 1310
},
{
"epoch": 1.930780559646539,
"grad_norm": 2.709588772784406,
"learning_rate": 2.9526232182088834e-08,
"loss": 0.0706,
"step": 1311
},
{
"epoch": 1.9322533136966127,
"grad_norm": 1.9492221449254272,
"learning_rate": 2.8284336441021797e-08,
"loss": 0.0744,
"step": 1312
},
{
"epoch": 1.9337260677466863,
"grad_norm": 2.599925072248245,
"learning_rate": 2.7069048336211823e-08,
"loss": 0.0816,
"step": 1313
},
{
"epoch": 1.93519882179676,
"grad_norm": 2.8220847289646156,
"learning_rate": 2.5880374371639594e-08,
"loss": 0.0563,
"step": 1314
},
{
"epoch": 1.9366715758468336,
"grad_norm": 2.27806945669308,
"learning_rate": 2.471832090885251e-08,
"loss": 0.0703,
"step": 1315
},
{
"epoch": 1.9381443298969072,
"grad_norm": 2.153300624329934,
"learning_rate": 2.358289416693027e-08,
"loss": 0.0673,
"step": 1316
},
{
"epoch": 1.9396170839469808,
"grad_norm": 2.3865823334755287,
"learning_rate": 2.2474100222451557e-08,
"loss": 0.0677,
"step": 1317
},
{
"epoch": 1.9410898379970545,
"grad_norm": 2.0839931170659742,
"learning_rate": 2.1391945009461844e-08,
"loss": 0.0515,
"step": 1318
},
{
"epoch": 1.9425625920471281,
"grad_norm": 2.804503515201076,
"learning_rate": 2.0336434319440656e-08,
"loss": 0.0953,
"step": 1319
},
{
"epoch": 1.9440353460972017,
"grad_norm": 2.3508543215655795,
"learning_rate": 1.930757380127324e-08,
"loss": 0.0465,
"step": 1320
},
{
"epoch": 1.9455081001472754,
"grad_norm": 2.2684159030677584,
"learning_rate": 1.8305368961215598e-08,
"loss": 0.0583,
"step": 1321
},
{
"epoch": 1.946980854197349,
"grad_norm": 2.5644986282122537,
"learning_rate": 1.7329825162870073e-08,
"loss": 0.055,
"step": 1322
},
{
"epoch": 1.9484536082474226,
"grad_norm": 2.283413295915947,
"learning_rate": 1.6380947627153143e-08,
"loss": 0.0695,
"step": 1323
},
{
"epoch": 1.9499263622974963,
"grad_norm": 2.069007200708932,
"learning_rate": 1.545874143226933e-08,
"loss": 0.0518,
"step": 1324
},
{
"epoch": 1.95139911634757,
"grad_norm": 2.9711317829064585,
"learning_rate": 1.456321151368345e-08,
"loss": 0.0611,
"step": 1325
},
{
"epoch": 1.9528718703976435,
"grad_norm": 2.1673859380067624,
"learning_rate": 1.3694362664094518e-08,
"loss": 0.0566,
"step": 1326
},
{
"epoch": 1.9543446244477174,
"grad_norm": 2.3741944406331803,
"learning_rate": 1.2852199533407994e-08,
"loss": 0.0767,
"step": 1327
},
{
"epoch": 1.955817378497791,
"grad_norm": 2.667214957330843,
"learning_rate": 1.2036726628715245e-08,
"loss": 0.056,
"step": 1328
},
{
"epoch": 1.9572901325478647,
"grad_norm": 2.268524748294768,
"learning_rate": 1.1247948314264678e-08,
"loss": 0.0593,
"step": 1329
},
{
"epoch": 1.9587628865979383,
"grad_norm": 2.2321912351767708,
"learning_rate": 1.0485868811441757e-08,
"loss": 0.0611,
"step": 1330
},
{
"epoch": 1.960235640648012,
"grad_norm": 2.963136564591045,
"learning_rate": 9.750492198744577e-09,
"loss": 0.0681,
"step": 1331
},
{
"epoch": 1.9617083946980856,
"grad_norm": 2.7342946960131242,
"learning_rate": 9.041822411763323e-09,
"loss": 0.0667,
"step": 1332
},
{
"epoch": 1.9631811487481592,
"grad_norm": 2.171580235749752,
"learning_rate": 8.359863243158074e-09,
"loss": 0.0624,
"step": 1333
},
{
"epoch": 1.9646539027982328,
"grad_norm": 2.2346302144602275,
"learning_rate": 7.704618342638804e-09,
"loss": 0.0535,
"step": 1334
},
{
"epoch": 1.9661266568483065,
"grad_norm": 2.5411814724089994,
"learning_rate": 7.076091216946524e-09,
"loss": 0.0613,
"step": 1335
},
{
"epoch": 1.96759941089838,
"grad_norm": 3.0903346424039713,
"learning_rate": 6.4742852298338434e-09,
"loss": 0.0601,
"step": 1336
},
{
"epoch": 1.9690721649484537,
"grad_norm": 2.468639539072872,
"learning_rate": 5.899203602046655e-09,
"loss": 0.0748,
"step": 1337
},
{
"epoch": 1.9705449189985274,
"grad_norm": 3.1384988040511006,
"learning_rate": 5.35084941130748e-09,
"loss": 0.0701,
"step": 1338
},
{
"epoch": 1.972017673048601,
"grad_norm": 2.9703348546242156,
"learning_rate": 4.829225592300479e-09,
"loss": 0.0848,
"step": 1339
},
{
"epoch": 1.9734904270986746,
"grad_norm": 1.9317883920454113,
"learning_rate": 4.334334936652029e-09,
"loss": 0.0426,
"step": 1340
},
{
"epoch": 1.9749631811487482,
"grad_norm": 2.878576339471186,
"learning_rate": 3.8661800929185035e-09,
"loss": 0.0737,
"step": 1341
},
{
"epoch": 1.9764359351988219,
"grad_norm": 2.354349775359537,
"learning_rate": 3.4247635665723977e-09,
"loss": 0.0629,
"step": 1342
},
{
"epoch": 1.9779086892488955,
"grad_norm": 3.194601011276422,
"learning_rate": 3.010087719986787e-09,
"loss": 0.0625,
"step": 1343
},
{
"epoch": 1.9793814432989691,
"grad_norm": 2.5572029437027273,
"learning_rate": 2.6221547724253337e-09,
"loss": 0.0585,
"step": 1344
},
{
"epoch": 1.9808541973490428,
"grad_norm": 2.460763245695378,
"learning_rate": 2.260966800027853e-09,
"loss": 0.0627,
"step": 1345
},
{
"epoch": 1.9823269513991164,
"grad_norm": 2.0858178223464146,
"learning_rate": 1.926525735800877e-09,
"loss": 0.0604,
"step": 1346
},
{
"epoch": 1.98379970544919,
"grad_norm": 2.60802968225033,
"learning_rate": 1.6188333696059988e-09,
"loss": 0.0681,
"step": 1347
},
{
"epoch": 1.9852724594992637,
"grad_norm": 2.896847869807856,
"learning_rate": 1.3378913481526534e-09,
"loss": 0.073,
"step": 1348
},
{
"epoch": 1.9867452135493373,
"grad_norm": 2.5968745701467513,
"learning_rate": 1.0837011749864624e-09,
"loss": 0.08,
"step": 1349
},
{
"epoch": 1.988217967599411,
"grad_norm": 2.0860913805543837,
"learning_rate": 8.562642104831265e-10,
"loss": 0.0554,
"step": 1350
},
{
"epoch": 1.9896907216494846,
"grad_norm": 2.153075308395945,
"learning_rate": 6.555816718389896e-10,
"loss": 0.0484,
"step": 1351
},
{
"epoch": 1.9911634756995582,
"grad_norm": 2.2798904474088877,
"learning_rate": 4.816546330688177e-10,
"loss": 0.0453,
"step": 1352
},
{
"epoch": 1.9926362297496318,
"grad_norm": 2.595728602874706,
"learning_rate": 3.344840249946968e-10,
"loss": 0.065,
"step": 1353
},
{
"epoch": 1.9941089837997055,
"grad_norm": 2.313139587362148,
"learning_rate": 2.1407063524436777e-10,
"loss": 0.0757,
"step": 1354
},
{
"epoch": 1.995581737849779,
"grad_norm": 2.5156912578544075,
"learning_rate": 1.2041510824678525e-10,
"loss": 0.0826,
"step": 1355
},
{
"epoch": 1.9970544918998527,
"grad_norm": 2.043220452782378,
"learning_rate": 5.351794522823195e-11,
"loss": 0.0531,
"step": 1356
},
{
"epoch": 1.9985272459499264,
"grad_norm": 1.9871202396885614,
"learning_rate": 1.3379504207877703e-11,
"loss": 0.0434,
"step": 1357
},
{
"epoch": 2.0,
"grad_norm": 1.9204232674227737,
"learning_rate": 0.0,
"loss": 0.0379,
"step": 1358
},
{
"epoch": 2.0,
"step": 1358,
"total_flos": 2955929518080.0,
"train_loss": 0.13508925529605598,
"train_runtime": 626.3733,
"train_samples_per_second": 17.341,
"train_steps_per_second": 2.168
}
],
"logging_steps": 1,
"max_steps": 1358,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2955929518080.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}