{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5026072752403091,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00025130363762015457,
"grad_norm": 2.0144803524017334,
"learning_rate": 0.0,
"loss": 2.7742,
"step": 1
},
{
"epoch": 0.0005026072752403091,
"grad_norm": 1.5290026664733887,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.9485,
"step": 2
},
{
"epoch": 0.0007539109128604636,
"grad_norm": 1.7576098442077637,
"learning_rate": 4.000000000000001e-06,
"loss": 3.2699,
"step": 3
},
{
"epoch": 0.0010052145504806183,
"grad_norm": 1.4803149700164795,
"learning_rate": 6e-06,
"loss": 2.6531,
"step": 4
},
{
"epoch": 0.0012565181881007727,
"grad_norm": 1.2919175624847412,
"learning_rate": 8.000000000000001e-06,
"loss": 2.7198,
"step": 5
},
{
"epoch": 0.0015078218257209273,
"grad_norm": 1.0887187719345093,
"learning_rate": 1e-05,
"loss": 2.9058,
"step": 6
},
{
"epoch": 0.001759125463341082,
"grad_norm": 1.176196575164795,
"learning_rate": 1.2e-05,
"loss": 2.769,
"step": 7
},
{
"epoch": 0.0020104291009612365,
"grad_norm": 1.4506360292434692,
"learning_rate": 1.4000000000000001e-05,
"loss": 2.8493,
"step": 8
},
{
"epoch": 0.002261732738581391,
"grad_norm": 0.8232998251914978,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.7691,
"step": 9
},
{
"epoch": 0.0025130363762015454,
"grad_norm": 0.8385952711105347,
"learning_rate": 1.8e-05,
"loss": 2.7073,
"step": 10
},
{
"epoch": 0.0027643400138217,
"grad_norm": 0.594434916973114,
"learning_rate": 2e-05,
"loss": 2.1646,
"step": 11
},
{
"epoch": 0.0030156436514418546,
"grad_norm": 0.8067689538002014,
"learning_rate": 2.2000000000000003e-05,
"loss": 2.5741,
"step": 12
},
{
"epoch": 0.0032669472890620092,
"grad_norm": 0.8038071393966675,
"learning_rate": 2.4e-05,
"loss": 2.6733,
"step": 13
},
{
"epoch": 0.003518250926682164,
"grad_norm": 0.48843103647232056,
"learning_rate": 2.6000000000000002e-05,
"loss": 2.598,
"step": 14
},
{
"epoch": 0.0037695545643023185,
"grad_norm": 0.8283182978630066,
"learning_rate": 2.8000000000000003e-05,
"loss": 2.8211,
"step": 15
},
{
"epoch": 0.004020858201922473,
"grad_norm": 0.4953489601612091,
"learning_rate": 3e-05,
"loss": 2.3855,
"step": 16
},
{
"epoch": 0.004272161839542627,
"grad_norm": 0.4421069025993347,
"learning_rate": 3.2000000000000005e-05,
"loss": 2.6665,
"step": 17
},
{
"epoch": 0.004523465477162782,
"grad_norm": 0.6795067191123962,
"learning_rate": 3.4000000000000007e-05,
"loss": 2.5962,
"step": 18
},
{
"epoch": 0.0047747691147829365,
"grad_norm": 0.5411877036094666,
"learning_rate": 3.6e-05,
"loss": 2.4962,
"step": 19
},
{
"epoch": 0.005026072752403091,
"grad_norm": 0.5864161252975464,
"learning_rate": 3.8e-05,
"loss": 2.4219,
"step": 20
},
{
"epoch": 0.005277376390023246,
"grad_norm": 0.5454627871513367,
"learning_rate": 4e-05,
"loss": 2.8872,
"step": 21
},
{
"epoch": 0.0055286800276434,
"grad_norm": 0.4071284532546997,
"learning_rate": 4.2e-05,
"loss": 2.4573,
"step": 22
},
{
"epoch": 0.005779983665263555,
"grad_norm": 0.2068735957145691,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.8764,
"step": 23
},
{
"epoch": 0.006031287302883709,
"grad_norm": 0.5132932662963867,
"learning_rate": 4.600000000000001e-05,
"loss": 2.4306,
"step": 24
},
{
"epoch": 0.006282590940503863,
"grad_norm": 0.3241199851036072,
"learning_rate": 4.8e-05,
"loss": 2.1521,
"step": 25
},
{
"epoch": 0.0065338945781240184,
"grad_norm": 0.4157683551311493,
"learning_rate": 5e-05,
"loss": 2.7139,
"step": 26
},
{
"epoch": 0.006785198215744173,
"grad_norm": 0.5854943990707397,
"learning_rate": 5.2000000000000004e-05,
"loss": 2.2383,
"step": 27
},
{
"epoch": 0.007036501853364328,
"grad_norm": 0.5483741760253906,
"learning_rate": 5.4000000000000005e-05,
"loss": 2.4569,
"step": 28
},
{
"epoch": 0.007287805490984482,
"grad_norm": 0.41493722796440125,
"learning_rate": 5.6000000000000006e-05,
"loss": 2.9473,
"step": 29
},
{
"epoch": 0.007539109128604637,
"grad_norm": 0.3696433901786804,
"learning_rate": 5.8e-05,
"loss": 2.2744,
"step": 30
},
{
"epoch": 0.007790412766224791,
"grad_norm": 0.18437762558460236,
"learning_rate": 6e-05,
"loss": 1.0493,
"step": 31
},
{
"epoch": 0.008041716403844946,
"grad_norm": 0.46951478719711304,
"learning_rate": 6.2e-05,
"loss": 2.4904,
"step": 32
},
{
"epoch": 0.0082930200414651,
"grad_norm": 0.32901376485824585,
"learning_rate": 6.400000000000001e-05,
"loss": 2.0627,
"step": 33
},
{
"epoch": 0.008544323679085255,
"grad_norm": 0.668229877948761,
"learning_rate": 6.6e-05,
"loss": 2.3694,
"step": 34
},
{
"epoch": 0.008795627316705409,
"grad_norm": 0.9680635333061218,
"learning_rate": 6.800000000000001e-05,
"loss": 2.3719,
"step": 35
},
{
"epoch": 0.009046930954325565,
"grad_norm": 0.6750502586364746,
"learning_rate": 7e-05,
"loss": 2.3538,
"step": 36
},
{
"epoch": 0.009298234591945719,
"grad_norm": 0.1759599894285202,
"learning_rate": 7.2e-05,
"loss": 1.0368,
"step": 37
},
{
"epoch": 0.009549538229565873,
"grad_norm": 0.5437096953392029,
"learning_rate": 7.4e-05,
"loss": 2.7952,
"step": 38
},
{
"epoch": 0.009800841867186027,
"grad_norm": 0.31723931431770325,
"learning_rate": 7.6e-05,
"loss": 2.4566,
"step": 39
},
{
"epoch": 0.010052145504806181,
"grad_norm": 0.34227266907691956,
"learning_rate": 7.800000000000001e-05,
"loss": 2.281,
"step": 40
},
{
"epoch": 0.010303449142426337,
"grad_norm": 0.3893303871154785,
"learning_rate": 8e-05,
"loss": 2.7128,
"step": 41
},
{
"epoch": 0.010554752780046492,
"grad_norm": 0.32380804419517517,
"learning_rate": 8.2e-05,
"loss": 2.7928,
"step": 42
},
{
"epoch": 0.010806056417666646,
"grad_norm": 0.29380202293395996,
"learning_rate": 8.4e-05,
"loss": 2.305,
"step": 43
},
{
"epoch": 0.0110573600552868,
"grad_norm": 0.4324714243412018,
"learning_rate": 8.6e-05,
"loss": 2.4824,
"step": 44
},
{
"epoch": 0.011308663692906954,
"grad_norm": 0.4421226978302002,
"learning_rate": 8.800000000000001e-05,
"loss": 2.1697,
"step": 45
},
{
"epoch": 0.01155996733052711,
"grad_norm": 0.5161215662956238,
"learning_rate": 9e-05,
"loss": 2.4065,
"step": 46
},
{
"epoch": 0.011811270968147264,
"grad_norm": 0.3017684817314148,
"learning_rate": 9.200000000000001e-05,
"loss": 2.4247,
"step": 47
},
{
"epoch": 0.012062574605767418,
"grad_norm": 0.5015223622322083,
"learning_rate": 9.4e-05,
"loss": 2.3724,
"step": 48
},
{
"epoch": 0.012313878243387573,
"grad_norm": 0.3141859471797943,
"learning_rate": 9.6e-05,
"loss": 1.8871,
"step": 49
},
{
"epoch": 0.012565181881007727,
"grad_norm": 0.1826818883419037,
"learning_rate": 9.8e-05,
"loss": 1.9633,
"step": 50
},
{
"epoch": 0.012816485518627883,
"grad_norm": 0.37197422981262207,
"learning_rate": 0.0001,
"loss": 2.4161,
"step": 51
},
{
"epoch": 0.013067789156248037,
"grad_norm": 0.3732236623764038,
"learning_rate": 9.99999840163606e-05,
"loss": 2.2943,
"step": 52
},
{
"epoch": 0.013319092793868191,
"grad_norm": 0.5340951681137085,
"learning_rate": 9.99999360654526e-05,
"loss": 2.2039,
"step": 53
},
{
"epoch": 0.013570396431488345,
"grad_norm": 0.394661545753479,
"learning_rate": 9.999985614730664e-05,
"loss": 2.3037,
"step": 54
},
{
"epoch": 0.013821700069108501,
"grad_norm": 0.3042058050632477,
"learning_rate": 9.999974426197384e-05,
"loss": 2.5117,
"step": 55
},
{
"epoch": 0.014073003706728655,
"grad_norm": 0.362106591463089,
"learning_rate": 9.999960040952574e-05,
"loss": 2.6846,
"step": 56
},
{
"epoch": 0.01432430734434881,
"grad_norm": 0.728489875793457,
"learning_rate": 9.99994245900543e-05,
"loss": 2.4936,
"step": 57
},
{
"epoch": 0.014575610981968964,
"grad_norm": 0.2623450756072998,
"learning_rate": 9.999921680367191e-05,
"loss": 2.6129,
"step": 58
},
{
"epoch": 0.014826914619589118,
"grad_norm": 0.24758170545101166,
"learning_rate": 9.999897705051145e-05,
"loss": 2.1818,
"step": 59
},
{
"epoch": 0.015078218257209274,
"grad_norm": 0.3045499324798584,
"learning_rate": 9.99987053307262e-05,
"loss": 2.3918,
"step": 60
},
{
"epoch": 0.015329521894829428,
"grad_norm": 0.3922071158885956,
"learning_rate": 9.999840164448984e-05,
"loss": 2.2996,
"step": 61
},
{
"epoch": 0.015580825532449582,
"grad_norm": 0.20261235535144806,
"learning_rate": 9.999806599199659e-05,
"loss": 1.9872,
"step": 62
},
{
"epoch": 0.015832129170069736,
"grad_norm": 0.3178957402706146,
"learning_rate": 9.999769837346103e-05,
"loss": 2.4412,
"step": 63
},
{
"epoch": 0.016083432807689892,
"grad_norm": 0.304047167301178,
"learning_rate": 9.999729878911816e-05,
"loss": 2.705,
"step": 64
},
{
"epoch": 0.016334736445310045,
"grad_norm": 0.5113909840583801,
"learning_rate": 9.99968672392235e-05,
"loss": 2.533,
"step": 65
},
{
"epoch": 0.0165860400829302,
"grad_norm": 0.29634010791778564,
"learning_rate": 9.999640372405295e-05,
"loss": 2.4283,
"step": 66
},
{
"epoch": 0.016837343720550357,
"grad_norm": 0.3125896453857422,
"learning_rate": 9.999590824390281e-05,
"loss": 2.4421,
"step": 67
},
{
"epoch": 0.01708864735817051,
"grad_norm": 0.38444283604621887,
"learning_rate": 9.999538079908993e-05,
"loss": 2.1928,
"step": 68
},
{
"epoch": 0.017339950995790665,
"grad_norm": 0.3269999325275421,
"learning_rate": 9.999482138995149e-05,
"loss": 2.6259,
"step": 69
},
{
"epoch": 0.017591254633410817,
"grad_norm": 0.30345460772514343,
"learning_rate": 9.999423001684513e-05,
"loss": 2.6393,
"step": 70
},
{
"epoch": 0.017842558271030973,
"grad_norm": 1.1956677436828613,
"learning_rate": 9.9993606680149e-05,
"loss": 2.9455,
"step": 71
},
{
"epoch": 0.01809386190865113,
"grad_norm": 0.4432305097579956,
"learning_rate": 9.999295138026157e-05,
"loss": 2.3158,
"step": 72
},
{
"epoch": 0.018345165546271282,
"grad_norm": 0.24386799335479736,
"learning_rate": 9.999226411760185e-05,
"loss": 2.6777,
"step": 73
},
{
"epoch": 0.018596469183891438,
"grad_norm": 0.34685081243515015,
"learning_rate": 9.99915448926092e-05,
"loss": 2.4423,
"step": 74
},
{
"epoch": 0.01884777282151159,
"grad_norm": 0.5708550214767456,
"learning_rate": 9.999079370574347e-05,
"loss": 2.2399,
"step": 75
},
{
"epoch": 0.019099076459131746,
"grad_norm": 0.2775181829929352,
"learning_rate": 9.999001055748492e-05,
"loss": 1.4073,
"step": 76
},
{
"epoch": 0.019350380096751902,
"grad_norm": 0.44185924530029297,
"learning_rate": 9.998919544833427e-05,
"loss": 2.2447,
"step": 77
},
{
"epoch": 0.019601683734372054,
"grad_norm": 0.35135698318481445,
"learning_rate": 9.998834837881263e-05,
"loss": 2.4701,
"step": 78
},
{
"epoch": 0.01985298737199221,
"grad_norm": 0.2450733631849289,
"learning_rate": 9.998746934946159e-05,
"loss": 2.1066,
"step": 79
},
{
"epoch": 0.020104291009612363,
"grad_norm": 0.3851965367794037,
"learning_rate": 9.998655836084316e-05,
"loss": 2.2371,
"step": 80
},
{
"epoch": 0.02035559464723252,
"grad_norm": 0.2562990188598633,
"learning_rate": 9.998561541353976e-05,
"loss": 2.5131,
"step": 81
},
{
"epoch": 0.020606898284852675,
"grad_norm": 0.35332947969436646,
"learning_rate": 9.998464050815426e-05,
"loss": 2.4474,
"step": 82
},
{
"epoch": 0.020858201922472827,
"grad_norm": 0.3566109836101532,
"learning_rate": 9.998363364530998e-05,
"loss": 2.5007,
"step": 83
},
{
"epoch": 0.021109505560092983,
"grad_norm": 0.27204859256744385,
"learning_rate": 9.998259482565063e-05,
"loss": 2.221,
"step": 84
},
{
"epoch": 0.021360809197713135,
"grad_norm": 0.1774623841047287,
"learning_rate": 9.998152404984036e-05,
"loss": 1.5693,
"step": 85
},
{
"epoch": 0.02161211283533329,
"grad_norm": 0.31386587023735046,
"learning_rate": 9.998042131856382e-05,
"loss": 2.1838,
"step": 86
},
{
"epoch": 0.021863416472953447,
"grad_norm": 0.4387069344520569,
"learning_rate": 9.997928663252601e-05,
"loss": 2.2176,
"step": 87
},
{
"epoch": 0.0221147201105736,
"grad_norm": 0.5427992343902588,
"learning_rate": 9.997811999245236e-05,
"loss": 2.2632,
"step": 88
},
{
"epoch": 0.022366023748193756,
"grad_norm": 0.14437389373779297,
"learning_rate": 9.997692139908879e-05,
"loss": 1.0033,
"step": 89
},
{
"epoch": 0.022617327385813908,
"grad_norm": 0.18704228103160858,
"learning_rate": 9.99756908532016e-05,
"loss": 1.1516,
"step": 90
},
{
"epoch": 0.022868631023434064,
"grad_norm": 0.23126636445522308,
"learning_rate": 9.997442835557753e-05,
"loss": 1.9892,
"step": 91
},
{
"epoch": 0.02311993466105422,
"grad_norm": 0.2967139780521393,
"learning_rate": 9.997313390702377e-05,
"loss": 2.0742,
"step": 92
},
{
"epoch": 0.023371238298674372,
"grad_norm": 0.25964412093162537,
"learning_rate": 9.997180750836792e-05,
"loss": 2.4486,
"step": 93
},
{
"epoch": 0.02362254193629453,
"grad_norm": 0.21999791264533997,
"learning_rate": 9.997044916045799e-05,
"loss": 1.0311,
"step": 94
},
{
"epoch": 0.02387384557391468,
"grad_norm": 0.22438450157642365,
"learning_rate": 9.996905886416244e-05,
"loss": 1.2788,
"step": 95
},
{
"epoch": 0.024125149211534837,
"grad_norm": 0.3627947270870209,
"learning_rate": 9.996763662037014e-05,
"loss": 2.2382,
"step": 96
},
{
"epoch": 0.024376452849154993,
"grad_norm": 0.2582647502422333,
"learning_rate": 9.996618242999042e-05,
"loss": 2.2999,
"step": 97
},
{
"epoch": 0.024627756486775145,
"grad_norm": 0.34755995869636536,
"learning_rate": 9.9964696293953e-05,
"loss": 2.2329,
"step": 98
},
{
"epoch": 0.0248790601243953,
"grad_norm": 0.15312296152114868,
"learning_rate": 9.996317821320802e-05,
"loss": 0.6043,
"step": 99
},
{
"epoch": 0.025130363762015454,
"grad_norm": 0.22593067586421967,
"learning_rate": 9.996162818872607e-05,
"loss": 2.3993,
"step": 100
},
{
"epoch": 0.02538166739963561,
"grad_norm": 0.3924887776374817,
"learning_rate": 9.996004622149814e-05,
"loss": 2.3133,
"step": 101
},
{
"epoch": 0.025632971037255765,
"grad_norm": 0.5138020515441895,
"learning_rate": 9.995843231253569e-05,
"loss": 2.4782,
"step": 102
},
{
"epoch": 0.025884274674875918,
"grad_norm": 0.42053958773612976,
"learning_rate": 9.995678646287053e-05,
"loss": 2.0569,
"step": 103
},
{
"epoch": 0.026135578312496074,
"grad_norm": 0.2818872034549713,
"learning_rate": 9.995510867355494e-05,
"loss": 2.7051,
"step": 104
},
{
"epoch": 0.02638688195011623,
"grad_norm": 0.4185803532600403,
"learning_rate": 9.995339894566158e-05,
"loss": 2.2182,
"step": 105
},
{
"epoch": 0.026638185587736382,
"grad_norm": 0.3512636721134186,
"learning_rate": 9.995165728028359e-05,
"loss": 2.5701,
"step": 106
},
{
"epoch": 0.026889489225356538,
"grad_norm": 0.3916986584663391,
"learning_rate": 9.994988367853451e-05,
"loss": 2.034,
"step": 107
},
{
"epoch": 0.02714079286297669,
"grad_norm": 0.3517094850540161,
"learning_rate": 9.994807814154824e-05,
"loss": 2.0577,
"step": 108
},
{
"epoch": 0.027392096500596846,
"grad_norm": 0.3244116008281708,
"learning_rate": 9.994624067047917e-05,
"loss": 2.1661,
"step": 109
},
{
"epoch": 0.027643400138217002,
"grad_norm": 0.42575743794441223,
"learning_rate": 9.994437126650207e-05,
"loss": 2.5129,
"step": 110
},
{
"epoch": 0.027894703775837155,
"grad_norm": 0.1691725254058838,
"learning_rate": 9.994246993081213e-05,
"loss": 1.5939,
"step": 111
},
{
"epoch": 0.02814600741345731,
"grad_norm": 0.33062514662742615,
"learning_rate": 9.994053666462498e-05,
"loss": 2.222,
"step": 112
},
{
"epoch": 0.028397311051077463,
"grad_norm": 0.3275945484638214,
"learning_rate": 9.993857146917662e-05,
"loss": 2.3336,
"step": 113
},
{
"epoch": 0.02864861468869762,
"grad_norm": 0.3942461907863617,
"learning_rate": 9.993657434572353e-05,
"loss": 2.3289,
"step": 114
},
{
"epoch": 0.028899918326317775,
"grad_norm": 0.3180122971534729,
"learning_rate": 9.993454529554251e-05,
"loss": 2.4353,
"step": 115
},
{
"epoch": 0.029151221963937927,
"grad_norm": 0.7072325944900513,
"learning_rate": 9.993248431993086e-05,
"loss": 1.9157,
"step": 116
},
{
"epoch": 0.029402525601558083,
"grad_norm": 0.32835853099823,
"learning_rate": 9.993039142020622e-05,
"loss": 2.8819,
"step": 117
},
{
"epoch": 0.029653829239178236,
"grad_norm": 0.2762772738933563,
"learning_rate": 9.992826659770672e-05,
"loss": 2.504,
"step": 118
},
{
"epoch": 0.029905132876798392,
"grad_norm": 0.2669197618961334,
"learning_rate": 9.992610985379082e-05,
"loss": 2.3047,
"step": 119
},
{
"epoch": 0.030156436514418548,
"grad_norm": 0.47167858481407166,
"learning_rate": 9.992392118983746e-05,
"loss": 2.2432,
"step": 120
},
{
"epoch": 0.0304077401520387,
"grad_norm": 0.18089920282363892,
"learning_rate": 9.992170060724593e-05,
"loss": 1.5848,
"step": 121
},
{
"epoch": 0.030659043789658856,
"grad_norm": 0.33109933137893677,
"learning_rate": 9.991944810743597e-05,
"loss": 2.3503,
"step": 122
},
{
"epoch": 0.03091034742727901,
"grad_norm": 0.24957267940044403,
"learning_rate": 9.991716369184765e-05,
"loss": 2.2894,
"step": 123
},
{
"epoch": 0.031161651064899164,
"grad_norm": 0.3461008667945862,
"learning_rate": 9.991484736194157e-05,
"loss": 2.2987,
"step": 124
},
{
"epoch": 0.03141295470251932,
"grad_norm": 0.2855774760246277,
"learning_rate": 9.991249911919862e-05,
"loss": 2.5193,
"step": 125
},
{
"epoch": 0.03166425834013947,
"grad_norm": 0.30880868434906006,
"learning_rate": 9.991011896512016e-05,
"loss": 2.5596,
"step": 126
},
{
"epoch": 0.031915561977759625,
"grad_norm": 0.4237726032733917,
"learning_rate": 9.990770690122793e-05,
"loss": 2.1401,
"step": 127
},
{
"epoch": 0.032166865615379785,
"grad_norm": 0.2743227183818817,
"learning_rate": 9.990526292906405e-05,
"loss": 1.858,
"step": 128
},
{
"epoch": 0.03241816925299994,
"grad_norm": 0.28934118151664734,
"learning_rate": 9.99027870501911e-05,
"loss": 1.9751,
"step": 129
},
{
"epoch": 0.03266947289062009,
"grad_norm": 0.4869619905948639,
"learning_rate": 9.990027926619197e-05,
"loss": 2.7332,
"step": 130
},
{
"epoch": 0.03292077652824025,
"grad_norm": 0.2246621996164322,
"learning_rate": 9.989773957867006e-05,
"loss": 2.2951,
"step": 131
},
{
"epoch": 0.0331720801658604,
"grad_norm": 0.46311891078948975,
"learning_rate": 9.989516798924908e-05,
"loss": 2.0357,
"step": 132
},
{
"epoch": 0.033423383803480554,
"grad_norm": 0.45459482073783875,
"learning_rate": 9.989256449957316e-05,
"loss": 2.2661,
"step": 133
},
{
"epoch": 0.03367468744110071,
"grad_norm": 0.4114730656147003,
"learning_rate": 9.988992911130683e-05,
"loss": 2.096,
"step": 134
},
{
"epoch": 0.033925991078720866,
"grad_norm": 0.468787282705307,
"learning_rate": 9.988726182613502e-05,
"loss": 2.2083,
"step": 135
},
{
"epoch": 0.03417729471634102,
"grad_norm": 0.30824795365333557,
"learning_rate": 9.988456264576305e-05,
"loss": 2.4525,
"step": 136
},
{
"epoch": 0.03442859835396117,
"grad_norm": 0.22959741950035095,
"learning_rate": 9.988183157191662e-05,
"loss": 2.4861,
"step": 137
},
{
"epoch": 0.03467990199158133,
"grad_norm": 0.37763601541519165,
"learning_rate": 9.987906860634184e-05,
"loss": 2.4736,
"step": 138
},
{
"epoch": 0.03493120562920148,
"grad_norm": 0.3474363386631012,
"learning_rate": 9.987627375080519e-05,
"loss": 2.4363,
"step": 139
},
{
"epoch": 0.035182509266821635,
"grad_norm": 0.3558574616909027,
"learning_rate": 9.987344700709356e-05,
"loss": 2.0973,
"step": 140
},
{
"epoch": 0.035433812904441794,
"grad_norm": 0.19336606562137604,
"learning_rate": 9.98705883770142e-05,
"loss": 1.3915,
"step": 141
},
{
"epoch": 0.03568511654206195,
"grad_norm": 0.4760534465312958,
"learning_rate": 9.986769786239477e-05,
"loss": 2.0163,
"step": 142
},
{
"epoch": 0.0359364201796821,
"grad_norm": 0.2839144170284271,
"learning_rate": 9.98647754650833e-05,
"loss": 1.7691,
"step": 143
},
{
"epoch": 0.03618772381730226,
"grad_norm": 0.3936741352081299,
"learning_rate": 9.986182118694825e-05,
"loss": 2.1423,
"step": 144
},
{
"epoch": 0.03643902745492241,
"grad_norm": 0.35313600301742554,
"learning_rate": 9.985883502987838e-05,
"loss": 2.3156,
"step": 145
},
{
"epoch": 0.036690331092542564,
"grad_norm": 0.3352813720703125,
"learning_rate": 9.985581699578287e-05,
"loss": 2.2992,
"step": 146
},
{
"epoch": 0.036941634730162716,
"grad_norm": 0.43075451254844666,
"learning_rate": 9.985276708659134e-05,
"loss": 2.7181,
"step": 147
},
{
"epoch": 0.037192938367782875,
"grad_norm": 0.4140123128890991,
"learning_rate": 9.984968530425369e-05,
"loss": 2.2454,
"step": 148
},
{
"epoch": 0.03744424200540303,
"grad_norm": 0.31063738465309143,
"learning_rate": 9.984657165074027e-05,
"loss": 2.4994,
"step": 149
},
{
"epoch": 0.03769554564302318,
"grad_norm": 0.22530898451805115,
"learning_rate": 9.984342612804176e-05,
"loss": 2.0645,
"step": 150
},
{
"epoch": 0.03794684928064334,
"grad_norm": 0.2811500132083893,
"learning_rate": 9.984024873816924e-05,
"loss": 2.3546,
"step": 151
},
{
"epoch": 0.03819815291826349,
"grad_norm": 0.4520877003669739,
"learning_rate": 9.983703948315417e-05,
"loss": 2.1208,
"step": 152
},
{
"epoch": 0.038449456555883645,
"grad_norm": 0.37516894936561584,
"learning_rate": 9.983379836504838e-05,
"loss": 2.3911,
"step": 153
},
{
"epoch": 0.038700760193503804,
"grad_norm": 0.4740954637527466,
"learning_rate": 9.983052538592404e-05,
"loss": 2.4538,
"step": 154
},
{
"epoch": 0.038952063831123956,
"grad_norm": 0.34146809577941895,
"learning_rate": 9.982722054787372e-05,
"loss": 2.5654,
"step": 155
},
{
"epoch": 0.03920336746874411,
"grad_norm": 0.3912610709667206,
"learning_rate": 9.982388385301038e-05,
"loss": 2.4403,
"step": 156
},
{
"epoch": 0.03945467110636426,
"grad_norm": 0.5822238922119141,
"learning_rate": 9.98205153034673e-05,
"loss": 2.3008,
"step": 157
},
{
"epoch": 0.03970597474398442,
"grad_norm": 0.1491788774728775,
"learning_rate": 9.981711490139814e-05,
"loss": 1.1215,
"step": 158
},
{
"epoch": 0.03995727838160457,
"grad_norm": 0.3438681960105896,
"learning_rate": 9.981368264897694e-05,
"loss": 2.7011,
"step": 159
},
{
"epoch": 0.040208582019224726,
"grad_norm": 0.3378736972808838,
"learning_rate": 9.98102185483981e-05,
"loss": 2.4961,
"step": 160
},
{
"epoch": 0.040459885656844885,
"grad_norm": 0.2592676281929016,
"learning_rate": 9.980672260187638e-05,
"loss": 1.8838,
"step": 161
},
{
"epoch": 0.04071118929446504,
"grad_norm": 0.31087106466293335,
"learning_rate": 9.980319481164688e-05,
"loss": 2.2719,
"step": 162
},
{
"epoch": 0.04096249293208519,
"grad_norm": 0.33119067549705505,
"learning_rate": 9.979963517996509e-05,
"loss": 2.0298,
"step": 163
},
{
"epoch": 0.04121379656970535,
"grad_norm": 0.4013742506504059,
"learning_rate": 9.979604370910685e-05,
"loss": 2.5852,
"step": 164
},
{
"epoch": 0.0414651002073255,
"grad_norm": 0.21013818681240082,
"learning_rate": 9.979242040136835e-05,
"loss": 1.1489,
"step": 165
},
{
"epoch": 0.041716403844945654,
"grad_norm": 0.5467535257339478,
"learning_rate": 9.978876525906613e-05,
"loss": 2.2094,
"step": 166
},
{
"epoch": 0.04196770748256581,
"grad_norm": 0.3550753593444824,
"learning_rate": 9.978507828453708e-05,
"loss": 2.2849,
"step": 167
},
{
"epoch": 0.042219011120185966,
"grad_norm": 0.2620997726917267,
"learning_rate": 9.978135948013847e-05,
"loss": 2.3229,
"step": 168
},
{
"epoch": 0.04247031475780612,
"grad_norm": 0.2706509530544281,
"learning_rate": 9.977760884824788e-05,
"loss": 2.2731,
"step": 169
},
{
"epoch": 0.04272161839542627,
"grad_norm": 0.41886723041534424,
"learning_rate": 9.977382639126328e-05,
"loss": 2.42,
"step": 170
},
{
"epoch": 0.04297292203304643,
"grad_norm": 0.33128440380096436,
"learning_rate": 9.977001211160296e-05,
"loss": 2.3203,
"step": 171
},
{
"epoch": 0.04322422567066658,
"grad_norm": 0.24908378720283508,
"learning_rate": 9.976616601170557e-05,
"loss": 1.8275,
"step": 172
},
{
"epoch": 0.043475529308286735,
"grad_norm": 0.3278106451034546,
"learning_rate": 9.976228809403008e-05,
"loss": 2.6484,
"step": 173
},
{
"epoch": 0.043726832945906895,
"grad_norm": 0.5737075209617615,
"learning_rate": 9.975837836105581e-05,
"loss": 2.2196,
"step": 174
},
{
"epoch": 0.04397813658352705,
"grad_norm": 0.3396255373954773,
"learning_rate": 9.975443681528247e-05,
"loss": 2.1756,
"step": 175
},
{
"epoch": 0.0442294402211472,
"grad_norm": 0.4173082411289215,
"learning_rate": 9.975046345923004e-05,
"loss": 1.9394,
"step": 176
},
{
"epoch": 0.04448074385876736,
"grad_norm": 0.2630142867565155,
"learning_rate": 9.974645829543889e-05,
"loss": 1.9798,
"step": 177
},
{
"epoch": 0.04473204749638751,
"grad_norm": 0.23479297757148743,
"learning_rate": 9.974242132646967e-05,
"loss": 1.8764,
"step": 178
},
{
"epoch": 0.044983351134007664,
"grad_norm": 0.2806346118450165,
"learning_rate": 9.973835255490343e-05,
"loss": 2.2942,
"step": 179
},
{
"epoch": 0.045234654771627816,
"grad_norm": 0.2745998203754425,
"learning_rate": 9.97342519833415e-05,
"loss": 2.1606,
"step": 180
},
{
"epoch": 0.045485958409247976,
"grad_norm": 0.46454527974128723,
"learning_rate": 9.973011961440559e-05,
"loss": 2.2312,
"step": 181
},
{
"epoch": 0.04573726204686813,
"grad_norm": 0.34017616510391235,
"learning_rate": 9.972595545073769e-05,
"loss": 2.3171,
"step": 182
},
{
"epoch": 0.04598856568448828,
"grad_norm": 0.34439125657081604,
"learning_rate": 9.972175949500012e-05,
"loss": 2.3098,
"step": 183
},
{
"epoch": 0.04623986932210844,
"grad_norm": 0.47229647636413574,
"learning_rate": 9.97175317498756e-05,
"loss": 2.1515,
"step": 184
},
{
"epoch": 0.04649117295972859,
"grad_norm": 0.3148500621318817,
"learning_rate": 9.971327221806706e-05,
"loss": 2.5827,
"step": 185
},
{
"epoch": 0.046742476597348745,
"grad_norm": 0.20196139812469482,
"learning_rate": 9.970898090229785e-05,
"loss": 1.5597,
"step": 186
},
{
"epoch": 0.046993780234968904,
"grad_norm": 0.26498332619667053,
"learning_rate": 9.97046578053116e-05,
"loss": 2.4454,
"step": 187
},
{
"epoch": 0.04724508387258906,
"grad_norm": 0.4051288068294525,
"learning_rate": 9.970030292987225e-05,
"loss": 2.3758,
"step": 188
},
{
"epoch": 0.04749638751020921,
"grad_norm": 0.3003399074077606,
"learning_rate": 9.969591627876409e-05,
"loss": 2.691,
"step": 189
},
{
"epoch": 0.04774769114782936,
"grad_norm": 0.22101780772209167,
"learning_rate": 9.96914978547917e-05,
"loss": 2.361,
"step": 190
},
{
"epoch": 0.04799899478544952,
"grad_norm": 0.342926025390625,
"learning_rate": 9.968704766077997e-05,
"loss": 2.4845,
"step": 191
},
{
"epoch": 0.048250298423069674,
"grad_norm": 0.3619941174983978,
"learning_rate": 9.968256569957411e-05,
"loss": 2.6731,
"step": 192
},
{
"epoch": 0.048501602060689826,
"grad_norm": 0.4262664020061493,
"learning_rate": 9.967805197403965e-05,
"loss": 2.6423,
"step": 193
},
{
"epoch": 0.048752905698309985,
"grad_norm": 0.21453841030597687,
"learning_rate": 9.96735064870624e-05,
"loss": 1.0993,
"step": 194
},
{
"epoch": 0.04900420933593014,
"grad_norm": 0.5756468176841736,
"learning_rate": 9.966892924154853e-05,
"loss": 2.1122,
"step": 195
},
{
"epoch": 0.04925551297355029,
"grad_norm": 0.5691533088684082,
"learning_rate": 9.96643202404245e-05,
"loss": 2.3865,
"step": 196
},
{
"epoch": 0.04950681661117045,
"grad_norm": 0.4420826733112335,
"learning_rate": 9.965967948663698e-05,
"loss": 2.1786,
"step": 197
},
{
"epoch": 0.0497581202487906,
"grad_norm": 0.4547330141067505,
"learning_rate": 9.965500698315306e-05,
"loss": 2.5167,
"step": 198
},
{
"epoch": 0.050009423886410755,
"grad_norm": 0.18379782140254974,
"learning_rate": 9.96503027329601e-05,
"loss": 1.0926,
"step": 199
},
{
"epoch": 0.05026072752403091,
"grad_norm": 0.20280440151691437,
"learning_rate": 9.964556673906572e-05,
"loss": 2.0984,
"step": 200
},
{
"epoch": 0.050512031161651066,
"grad_norm": 0.5554583072662354,
"learning_rate": 9.964079900449785e-05,
"loss": 2.8414,
"step": 201
},
{
"epoch": 0.05076333479927122,
"grad_norm": 0.3430071473121643,
"learning_rate": 9.963599953230473e-05,
"loss": 2.4415,
"step": 202
},
{
"epoch": 0.05101463843689137,
"grad_norm": 0.34130024909973145,
"learning_rate": 9.96311683255549e-05,
"loss": 2.1872,
"step": 203
},
{
"epoch": 0.05126594207451153,
"grad_norm": 0.3914225995540619,
"learning_rate": 9.962630538733715e-05,
"loss": 2.3896,
"step": 204
},
{
"epoch": 0.05151724571213168,
"grad_norm": 0.6636744141578674,
"learning_rate": 9.962141072076057e-05,
"loss": 2.5256,
"step": 205
},
{
"epoch": 0.051768549349751836,
"grad_norm": 0.419360876083374,
"learning_rate": 9.961648432895454e-05,
"loss": 2.1897,
"step": 206
},
{
"epoch": 0.052019852987371995,
"grad_norm": 0.3426320552825928,
"learning_rate": 9.961152621506876e-05,
"loss": 2.433,
"step": 207
},
{
"epoch": 0.05227115662499215,
"grad_norm": 0.24097320437431335,
"learning_rate": 9.960653638227315e-05,
"loss": 1.899,
"step": 208
},
{
"epoch": 0.0525224602626123,
"grad_norm": 0.40818917751312256,
"learning_rate": 9.960151483375795e-05,
"loss": 2.4313,
"step": 209
},
{
"epoch": 0.05277376390023246,
"grad_norm": 0.34848183393478394,
"learning_rate": 9.959646157273366e-05,
"loss": 1.9527,
"step": 210
},
{
"epoch": 0.05302506753785261,
"grad_norm": 0.18201400339603424,
"learning_rate": 9.959137660243105e-05,
"loss": 1.8784,
"step": 211
},
{
"epoch": 0.053276371175472764,
"grad_norm": 0.3388751745223999,
"learning_rate": 9.95862599261012e-05,
"loss": 2.4083,
"step": 212
},
{
"epoch": 0.05352767481309292,
"grad_norm": 0.2567930519580841,
"learning_rate": 9.958111154701542e-05,
"loss": 1.9071,
"step": 213
},
{
"epoch": 0.053778978450713076,
"grad_norm": 0.39157772064208984,
"learning_rate": 9.957593146846529e-05,
"loss": 2.072,
"step": 214
},
{
"epoch": 0.05403028208833323,
"grad_norm": 0.40348386764526367,
"learning_rate": 9.95707196937627e-05,
"loss": 2.427,
"step": 215
},
{
"epoch": 0.05428158572595338,
"grad_norm": 0.47843194007873535,
"learning_rate": 9.956547622623973e-05,
"loss": 2.0144,
"step": 216
},
{
"epoch": 0.05453288936357354,
"grad_norm": 0.337272971868515,
"learning_rate": 9.956020106924882e-05,
"loss": 1.7783,
"step": 217
},
{
"epoch": 0.05478419300119369,
"grad_norm": 0.40009111166000366,
"learning_rate": 9.955489422616258e-05,
"loss": 2.2961,
"step": 218
},
{
"epoch": 0.055035496638813845,
"grad_norm": 0.28611692786216736,
"learning_rate": 9.954955570037395e-05,
"loss": 2.44,
"step": 219
},
{
"epoch": 0.055286800276434005,
"grad_norm": 0.3352760374546051,
"learning_rate": 9.954418549529605e-05,
"loss": 2.2449,
"step": 220
},
{
"epoch": 0.05553810391405416,
"grad_norm": 0.295691579580307,
"learning_rate": 9.953878361436232e-05,
"loss": 1.9398,
"step": 221
},
{
"epoch": 0.05578940755167431,
"grad_norm": 0.41174155473709106,
"learning_rate": 9.953335006102643e-05,
"loss": 2.241,
"step": 222
},
{
"epoch": 0.05604071118929446,
"grad_norm": 0.32655176520347595,
"learning_rate": 9.95278848387623e-05,
"loss": 2.078,
"step": 223
},
{
"epoch": 0.05629201482691462,
"grad_norm": 0.26872923970222473,
"learning_rate": 9.95223879510641e-05,
"loss": 1.9153,
"step": 224
},
{
"epoch": 0.056543318464534774,
"grad_norm": 0.37920647859573364,
"learning_rate": 9.951685940144622e-05,
"loss": 2.5823,
"step": 225
},
{
"epoch": 0.056794622102154926,
"grad_norm": 0.18780489265918732,
"learning_rate": 9.951129919344334e-05,
"loss": 0.9544,
"step": 226
},
{
"epoch": 0.057045925739775086,
"grad_norm": 0.3492078185081482,
"learning_rate": 9.950570733061033e-05,
"loss": 2.3073,
"step": 227
},
{
"epoch": 0.05729722937739524,
"grad_norm": 0.29748043417930603,
"learning_rate": 9.950008381652235e-05,
"loss": 1.6764,
"step": 228
},
{
"epoch": 0.05754853301501539,
"grad_norm": 0.30301064252853394,
"learning_rate": 9.949442865477474e-05,
"loss": 2.4839,
"step": 229
},
{
"epoch": 0.05779983665263555,
"grad_norm": 1.056869387626648,
"learning_rate": 9.948874184898313e-05,
"loss": 1.9098,
"step": 230
},
{
"epoch": 0.0580511402902557,
"grad_norm": 0.33054810762405396,
"learning_rate": 9.948302340278333e-05,
"loss": 2.6708,
"step": 231
},
{
"epoch": 0.058302443927875855,
"grad_norm": 0.37433305382728577,
"learning_rate": 9.94772733198314e-05,
"loss": 2.4231,
"step": 232
},
{
"epoch": 0.05855374756549601,
"grad_norm": 0.35789650678634644,
"learning_rate": 9.947149160380366e-05,
"loss": 2.1926,
"step": 233
},
{
"epoch": 0.05880505120311617,
"grad_norm": 0.3637252151966095,
"learning_rate": 9.94656782583966e-05,
"loss": 2.0955,
"step": 234
},
{
"epoch": 0.05905635484073632,
"grad_norm": 0.37392285466194153,
"learning_rate": 9.945983328732698e-05,
"loss": 2.2662,
"step": 235
},
{
"epoch": 0.05930765847835647,
"grad_norm": 0.36526796221733093,
"learning_rate": 9.945395669433172e-05,
"loss": 2.4013,
"step": 236
},
{
"epoch": 0.05955896211597663,
"grad_norm": 0.3178480267524719,
"learning_rate": 9.944804848316802e-05,
"loss": 2.4751,
"step": 237
},
{
"epoch": 0.059810265753596784,
"grad_norm": 0.5924585461616516,
"learning_rate": 9.944210865761328e-05,
"loss": 2.4306,
"step": 238
},
{
"epoch": 0.060061569391216936,
"grad_norm": 0.19464784860610962,
"learning_rate": 9.943613722146505e-05,
"loss": 1.7291,
"step": 239
},
{
"epoch": 0.060312873028837095,
"grad_norm": 0.43970218300819397,
"learning_rate": 9.943013417854122e-05,
"loss": 2.6384,
"step": 240
},
{
"epoch": 0.06056417666645725,
"grad_norm": 0.24550621211528778,
"learning_rate": 9.942409953267972e-05,
"loss": 1.5974,
"step": 241
},
{
"epoch": 0.0608154803040774,
"grad_norm": 0.33134451508522034,
"learning_rate": 9.941803328773885e-05,
"loss": 2.4358,
"step": 242
},
{
"epoch": 0.06106678394169755,
"grad_norm": 0.5241190791130066,
"learning_rate": 9.941193544759699e-05,
"loss": 2.3483,
"step": 243
},
{
"epoch": 0.06131808757931771,
"grad_norm": 0.5123705863952637,
"learning_rate": 9.940580601615279e-05,
"loss": 1.9421,
"step": 244
},
{
"epoch": 0.061569391216937865,
"grad_norm": 0.3747979998588562,
"learning_rate": 9.939964499732507e-05,
"loss": 2.207,
"step": 245
},
{
"epoch": 0.06182069485455802,
"grad_norm": 0.3442586362361908,
"learning_rate": 9.939345239505284e-05,
"loss": 2.2754,
"step": 246
},
{
"epoch": 0.062071998492178176,
"grad_norm": 0.3288307785987854,
"learning_rate": 9.938722821329532e-05,
"loss": 2.3217,
"step": 247
},
{
"epoch": 0.06232330212979833,
"grad_norm": 0.33456501364707947,
"learning_rate": 9.938097245603193e-05,
"loss": 2.1507,
"step": 248
},
{
"epoch": 0.06257460576741848,
"grad_norm": 0.4627265930175781,
"learning_rate": 9.937468512726223e-05,
"loss": 2.4072,
"step": 249
},
{
"epoch": 0.06282590940503864,
"grad_norm": 0.2955438196659088,
"learning_rate": 9.9368366231006e-05,
"loss": 2.452,
"step": 250
},
{
"epoch": 0.06307721304265879,
"grad_norm": 0.7906093001365662,
"learning_rate": 9.936201577130324e-05,
"loss": 2.1179,
"step": 251
},
{
"epoch": 0.06332851668027895,
"grad_norm": 0.41081616282463074,
"learning_rate": 9.935563375221404e-05,
"loss": 2.0524,
"step": 252
},
{
"epoch": 0.0635798203178991,
"grad_norm": 0.18861036002635956,
"learning_rate": 9.934922017781873e-05,
"loss": 1.4761,
"step": 253
},
{
"epoch": 0.06383112395551925,
"grad_norm": 0.27071478962898254,
"learning_rate": 9.93427750522178e-05,
"loss": 2.2497,
"step": 254
},
{
"epoch": 0.06408242759313941,
"grad_norm": 0.335267573595047,
"learning_rate": 9.933629837953191e-05,
"loss": 2.3577,
"step": 255
},
{
"epoch": 0.06433373123075957,
"grad_norm": 0.3902337849140167,
"learning_rate": 9.932979016390192e-05,
"loss": 2.2602,
"step": 256
},
{
"epoch": 0.06458503486837971,
"grad_norm": 0.550452470779419,
"learning_rate": 9.932325040948878e-05,
"loss": 2.1322,
"step": 257
},
{
"epoch": 0.06483633850599987,
"grad_norm": 0.42229825258255005,
"learning_rate": 9.93166791204737e-05,
"loss": 2.449,
"step": 258
},
{
"epoch": 0.06508764214362003,
"grad_norm": 0.2767508924007416,
"learning_rate": 9.931007630105798e-05,
"loss": 2.552,
"step": 259
},
{
"epoch": 0.06533894578124018,
"grad_norm": 0.3925112783908844,
"learning_rate": 9.93034419554631e-05,
"loss": 2.5078,
"step": 260
},
{
"epoch": 0.06559024941886034,
"grad_norm": 0.38622385263442993,
"learning_rate": 9.929677608793072e-05,
"loss": 2.5889,
"step": 261
},
{
"epoch": 0.0658415530564805,
"grad_norm": 0.3186779022216797,
"learning_rate": 9.929007870272262e-05,
"loss": 2.3722,
"step": 262
},
{
"epoch": 0.06609285669410064,
"grad_norm": 0.308403342962265,
"learning_rate": 9.928334980412073e-05,
"loss": 2.8672,
"step": 263
},
{
"epoch": 0.0663441603317208,
"grad_norm": 0.37506890296936035,
"learning_rate": 9.927658939642716e-05,
"loss": 2.3802,
"step": 264
},
{
"epoch": 0.06659546396934096,
"grad_norm": 0.43554747104644775,
"learning_rate": 9.926979748396415e-05,
"loss": 2.1515,
"step": 265
},
{
"epoch": 0.06684676760696111,
"grad_norm": 0.4737273156642914,
"learning_rate": 9.926297407107406e-05,
"loss": 2.4804,
"step": 266
},
{
"epoch": 0.06709807124458127,
"grad_norm": 0.30481746792793274,
"learning_rate": 9.925611916211943e-05,
"loss": 2.5455,
"step": 267
},
{
"epoch": 0.06734937488220143,
"grad_norm": 0.29122671484947205,
"learning_rate": 9.92492327614829e-05,
"loss": 1.9563,
"step": 268
},
{
"epoch": 0.06760067851982157,
"grad_norm": 0.2516386806964874,
"learning_rate": 9.924231487356725e-05,
"loss": 2.1242,
"step": 269
},
{
"epoch": 0.06785198215744173,
"grad_norm": 0.1990005224943161,
"learning_rate": 9.923536550279544e-05,
"loss": 1.6236,
"step": 270
},
{
"epoch": 0.06810328579506188,
"grad_norm": 0.3531213402748108,
"learning_rate": 9.92283846536105e-05,
"loss": 2.3706,
"step": 271
},
{
"epoch": 0.06835458943268204,
"grad_norm": 0.358190655708313,
"learning_rate": 9.922137233047558e-05,
"loss": 2.3467,
"step": 272
},
{
"epoch": 0.0686058930703022,
"grad_norm": 0.3931328058242798,
"learning_rate": 9.9214328537874e-05,
"loss": 2.359,
"step": 273
},
{
"epoch": 0.06885719670792234,
"grad_norm": 0.30483195185661316,
"learning_rate": 9.92072532803092e-05,
"loss": 2.8789,
"step": 274
},
{
"epoch": 0.0691085003455425,
"grad_norm": 0.155193030834198,
"learning_rate": 9.920014656230468e-05,
"loss": 1.4218,
"step": 275
},
{
"epoch": 0.06935980398316266,
"grad_norm": 0.2718715965747833,
"learning_rate": 9.919300838840409e-05,
"loss": 2.6687,
"step": 276
},
{
"epoch": 0.0696111076207828,
"grad_norm": 0.45890912413597107,
"learning_rate": 9.91858387631712e-05,
"loss": 2.4566,
"step": 277
},
{
"epoch": 0.06986241125840296,
"grad_norm": 0.39152559638023376,
"learning_rate": 9.917863769118988e-05,
"loss": 2.7171,
"step": 278
},
{
"epoch": 0.07011371489602312,
"grad_norm": 0.423405259847641,
"learning_rate": 9.91714051770641e-05,
"loss": 2.4389,
"step": 279
},
{
"epoch": 0.07036501853364327,
"grad_norm": 0.2914890944957733,
"learning_rate": 9.916414122541794e-05,
"loss": 2.8702,
"step": 280
},
{
"epoch": 0.07061632217126343,
"grad_norm": 0.45607659220695496,
"learning_rate": 9.915684584089557e-05,
"loss": 2.2563,
"step": 281
},
{
"epoch": 0.07086762580888359,
"grad_norm": 0.8603537678718567,
"learning_rate": 9.914951902816128e-05,
"loss": 2.0017,
"step": 282
},
{
"epoch": 0.07111892944650373,
"grad_norm": 0.34362930059432983,
"learning_rate": 9.91421607918994e-05,
"loss": 2.5167,
"step": 283
},
{
"epoch": 0.0713702330841239,
"grad_norm": 0.3992220163345337,
"learning_rate": 9.913477113681441e-05,
"loss": 2.1986,
"step": 284
},
{
"epoch": 0.07162153672174405,
"grad_norm": 0.409201443195343,
"learning_rate": 9.912735006763085e-05,
"loss": 1.8954,
"step": 285
},
{
"epoch": 0.0718728403593642,
"grad_norm": 0.293445348739624,
"learning_rate": 9.911989758909335e-05,
"loss": 2.3246,
"step": 286
},
{
"epoch": 0.07212414399698436,
"grad_norm": 0.3958424925804138,
"learning_rate": 9.911241370596663e-05,
"loss": 2.35,
"step": 287
},
{
"epoch": 0.07237544763460452,
"grad_norm": 0.5355744957923889,
"learning_rate": 9.910489842303544e-05,
"loss": 2.3103,
"step": 288
},
{
"epoch": 0.07262675127222466,
"grad_norm": 0.4253835678100586,
"learning_rate": 9.909735174510467e-05,
"loss": 2.5327,
"step": 289
},
{
"epoch": 0.07287805490984482,
"grad_norm": 0.2699333727359772,
"learning_rate": 9.908977367699926e-05,
"loss": 1.4629,
"step": 290
},
{
"epoch": 0.07312935854746497,
"grad_norm": 0.44158613681793213,
"learning_rate": 9.90821642235642e-05,
"loss": 2.1242,
"step": 291
},
{
"epoch": 0.07338066218508513,
"grad_norm": 0.33713415265083313,
"learning_rate": 9.907452338966457e-05,
"loss": 2.5233,
"step": 292
},
{
"epoch": 0.07363196582270529,
"grad_norm": 0.21191152930259705,
"learning_rate": 9.906685118018549e-05,
"loss": 1.9315,
"step": 293
},
{
"epoch": 0.07388326946032543,
"grad_norm": 0.3546162247657776,
"learning_rate": 9.905914760003216e-05,
"loss": 2.3661,
"step": 294
},
{
"epoch": 0.07413457309794559,
"grad_norm": 0.33006587624549866,
"learning_rate": 9.905141265412984e-05,
"loss": 1.7989,
"step": 295
},
{
"epoch": 0.07438587673556575,
"grad_norm": 0.4529229700565338,
"learning_rate": 9.904364634742385e-05,
"loss": 2.4695,
"step": 296
},
{
"epoch": 0.0746371803731859,
"grad_norm": 0.29525983333587646,
"learning_rate": 9.90358486848795e-05,
"loss": 2.1218,
"step": 297
},
{
"epoch": 0.07488848401080606,
"grad_norm": 0.273483544588089,
"learning_rate": 9.902801967148219e-05,
"loss": 2.4696,
"step": 298
},
{
"epoch": 0.07513978764842622,
"grad_norm": 0.3772584795951843,
"learning_rate": 9.902015931223742e-05,
"loss": 2.3749,
"step": 299
},
{
"epoch": 0.07539109128604636,
"grad_norm": 0.15759634971618652,
"learning_rate": 9.901226761217062e-05,
"loss": 0.7498,
"step": 300
},
{
"epoch": 0.07564239492366652,
"grad_norm": 0.2802472710609436,
"learning_rate": 9.900434457632734e-05,
"loss": 2.3353,
"step": 301
},
{
"epoch": 0.07589369856128668,
"grad_norm": 0.7529959678649902,
"learning_rate": 9.899639020977314e-05,
"loss": 1.7885,
"step": 302
},
{
"epoch": 0.07614500219890682,
"grad_norm": 0.3906523585319519,
"learning_rate": 9.89884045175936e-05,
"loss": 2.4677,
"step": 303
},
{
"epoch": 0.07639630583652698,
"grad_norm": 0.31307291984558105,
"learning_rate": 9.898038750489433e-05,
"loss": 2.408,
"step": 304
},
{
"epoch": 0.07664760947414714,
"grad_norm": 0.3155834674835205,
"learning_rate": 9.897233917680098e-05,
"loss": 2.0308,
"step": 305
},
{
"epoch": 0.07689891311176729,
"grad_norm": 0.2838704586029053,
"learning_rate": 9.896425953845923e-05,
"loss": 2.1338,
"step": 306
},
{
"epoch": 0.07715021674938745,
"grad_norm": 0.3904295563697815,
"learning_rate": 9.895614859503472e-05,
"loss": 2.4403,
"step": 307
},
{
"epoch": 0.07740152038700761,
"grad_norm": 0.27329105138778687,
"learning_rate": 9.89480063517132e-05,
"loss": 1.7174,
"step": 308
},
{
"epoch": 0.07765282402462775,
"grad_norm": 0.20863182842731476,
"learning_rate": 9.893983281370034e-05,
"loss": 2.1524,
"step": 309
},
{
"epoch": 0.07790412766224791,
"grad_norm": 0.3970401883125305,
"learning_rate": 9.893162798622185e-05,
"loss": 3.0095,
"step": 310
},
{
"epoch": 0.07815543129986807,
"grad_norm": 0.3557833731174469,
"learning_rate": 9.892339187452347e-05,
"loss": 2.2407,
"step": 311
},
{
"epoch": 0.07840673493748822,
"grad_norm": 0.3668101131916046,
"learning_rate": 9.891512448387092e-05,
"loss": 1.8899,
"step": 312
},
{
"epoch": 0.07865803857510838,
"grad_norm": 0.2897646129131317,
"learning_rate": 9.890682581954991e-05,
"loss": 2.4487,
"step": 313
},
{
"epoch": 0.07890934221272852,
"grad_norm": 0.31541863083839417,
"learning_rate": 9.889849588686617e-05,
"loss": 1.9136,
"step": 314
},
{
"epoch": 0.07916064585034868,
"grad_norm": 0.4142214357852936,
"learning_rate": 9.889013469114539e-05,
"loss": 2.4315,
"step": 315
},
{
"epoch": 0.07941194948796884,
"grad_norm": 0.39966338872909546,
"learning_rate": 9.888174223773325e-05,
"loss": 2.0768,
"step": 316
},
{
"epoch": 0.07966325312558899,
"grad_norm": 0.3038378357887268,
"learning_rate": 9.887331853199546e-05,
"loss": 2.4825,
"step": 317
},
{
"epoch": 0.07991455676320915,
"grad_norm": 0.36689433455467224,
"learning_rate": 9.886486357931767e-05,
"loss": 2.3153,
"step": 318
},
{
"epoch": 0.0801658604008293,
"grad_norm": 0.3301517963409424,
"learning_rate": 9.885637738510551e-05,
"loss": 2.4927,
"step": 319
},
{
"epoch": 0.08041716403844945,
"grad_norm": 0.26406022906303406,
"learning_rate": 9.884785995478458e-05,
"loss": 2.3793,
"step": 320
},
{
"epoch": 0.08066846767606961,
"grad_norm": 0.3765343427658081,
"learning_rate": 9.883931129380049e-05,
"loss": 2.3813,
"step": 321
},
{
"epoch": 0.08091977131368977,
"grad_norm": 0.42643749713897705,
"learning_rate": 9.883073140761876e-05,
"loss": 2.444,
"step": 322
},
{
"epoch": 0.08117107495130992,
"grad_norm": 0.42273804545402527,
"learning_rate": 9.882212030172493e-05,
"loss": 2.3634,
"step": 323
},
{
"epoch": 0.08142237858893007,
"grad_norm": 0.3113279938697815,
"learning_rate": 9.881347798162443e-05,
"loss": 2.236,
"step": 324
},
{
"epoch": 0.08167368222655023,
"grad_norm": 0.4435007870197296,
"learning_rate": 9.880480445284274e-05,
"loss": 2.3091,
"step": 325
},
{
"epoch": 0.08192498586417038,
"grad_norm": 0.5760218501091003,
"learning_rate": 9.879609972092522e-05,
"loss": 1.7405,
"step": 326
},
{
"epoch": 0.08217628950179054,
"grad_norm": 0.3486250638961792,
"learning_rate": 9.878736379143719e-05,
"loss": 1.9632,
"step": 327
},
{
"epoch": 0.0824275931394107,
"grad_norm": 0.33690646290779114,
"learning_rate": 9.877859666996395e-05,
"loss": 2.454,
"step": 328
},
{
"epoch": 0.08267889677703084,
"grad_norm": 0.4438174068927765,
"learning_rate": 9.876979836211069e-05,
"loss": 2.0577,
"step": 329
},
{
"epoch": 0.082930200414651,
"grad_norm": 0.13432446122169495,
"learning_rate": 9.87609688735026e-05,
"loss": 0.8846,
"step": 330
},
{
"epoch": 0.08318150405227116,
"grad_norm": 0.4359930753707886,
"learning_rate": 9.875210820978475e-05,
"loss": 2.1561,
"step": 331
},
{
"epoch": 0.08343280768989131,
"grad_norm": 0.38827580213546753,
"learning_rate": 9.87432163766222e-05,
"loss": 2.2525,
"step": 332
},
{
"epoch": 0.08368411132751147,
"grad_norm": 0.3862955868244171,
"learning_rate": 9.873429337969985e-05,
"loss": 2.3337,
"step": 333
},
{
"epoch": 0.08393541496513161,
"grad_norm": 0.3881922960281372,
"learning_rate": 9.872533922472264e-05,
"loss": 2.3624,
"step": 334
},
{
"epoch": 0.08418671860275177,
"grad_norm": 0.21210597455501556,
"learning_rate": 9.871635391741533e-05,
"loss": 2.0269,
"step": 335
},
{
"epoch": 0.08443802224037193,
"grad_norm": 0.38510724902153015,
"learning_rate": 9.870733746352265e-05,
"loss": 2.1879,
"step": 336
},
{
"epoch": 0.08468932587799208,
"grad_norm": 0.31321218609809875,
"learning_rate": 9.869828986880924e-05,
"loss": 1.8977,
"step": 337
},
{
"epoch": 0.08494062951561224,
"grad_norm": 0.3835349678993225,
"learning_rate": 9.868921113905961e-05,
"loss": 2.5596,
"step": 338
},
{
"epoch": 0.0851919331532324,
"grad_norm": 0.2992005944252014,
"learning_rate": 9.868010128007823e-05,
"loss": 2.3065,
"step": 339
},
{
"epoch": 0.08544323679085254,
"grad_norm": 0.31576502323150635,
"learning_rate": 9.867096029768943e-05,
"loss": 2.4326,
"step": 340
},
{
"epoch": 0.0856945404284727,
"grad_norm": 0.265114426612854,
"learning_rate": 9.866178819773747e-05,
"loss": 2.4567,
"step": 341
},
{
"epoch": 0.08594584406609286,
"grad_norm": 0.40755563974380493,
"learning_rate": 9.86525849860865e-05,
"loss": 2.4134,
"step": 342
},
{
"epoch": 0.086197147703713,
"grad_norm": 0.35845133662223816,
"learning_rate": 9.864335066862054e-05,
"loss": 2.1912,
"step": 343
},
{
"epoch": 0.08644845134133317,
"grad_norm": 0.2640887498855591,
"learning_rate": 9.863408525124349e-05,
"loss": 2.3818,
"step": 344
},
{
"epoch": 0.08669975497895333,
"grad_norm": 0.19019848108291626,
"learning_rate": 9.862478873987919e-05,
"loss": 1.2064,
"step": 345
},
{
"epoch": 0.08695105861657347,
"grad_norm": 0.3398876488208771,
"learning_rate": 9.861546114047131e-05,
"loss": 2.513,
"step": 346
},
{
"epoch": 0.08720236225419363,
"grad_norm": 0.3897905945777893,
"learning_rate": 9.86061024589834e-05,
"loss": 2.3802,
"step": 347
},
{
"epoch": 0.08745366589181379,
"grad_norm": 0.5403574109077454,
"learning_rate": 9.859671270139892e-05,
"loss": 2.492,
"step": 348
},
{
"epoch": 0.08770496952943393,
"grad_norm": 0.38538798689842224,
"learning_rate": 9.858729187372114e-05,
"loss": 2.6697,
"step": 349
},
{
"epoch": 0.0879562731670541,
"grad_norm": 0.47243911027908325,
"learning_rate": 9.857783998197321e-05,
"loss": 2.5493,
"step": 350
},
{
"epoch": 0.08820757680467425,
"grad_norm": 0.24692249298095703,
"learning_rate": 9.85683570321982e-05,
"loss": 1.9117,
"step": 351
},
{
"epoch": 0.0884588804422944,
"grad_norm": 0.35984736680984497,
"learning_rate": 9.855884303045897e-05,
"loss": 2.0952,
"step": 352
},
{
"epoch": 0.08871018407991456,
"grad_norm": 0.4461250901222229,
"learning_rate": 9.854929798283826e-05,
"loss": 2.1836,
"step": 353
},
{
"epoch": 0.08896148771753472,
"grad_norm": 0.4806773066520691,
"learning_rate": 9.853972189543864e-05,
"loss": 2.0937,
"step": 354
},
{
"epoch": 0.08921279135515486,
"grad_norm": 0.5709269642829895,
"learning_rate": 9.853011477438254e-05,
"loss": 1.9677,
"step": 355
},
{
"epoch": 0.08946409499277502,
"grad_norm": 0.3929988443851471,
"learning_rate": 9.852047662581225e-05,
"loss": 2.5909,
"step": 356
},
{
"epoch": 0.08971539863039517,
"grad_norm": 0.33998608589172363,
"learning_rate": 9.851080745588987e-05,
"loss": 2.0388,
"step": 357
},
{
"epoch": 0.08996670226801533,
"grad_norm": 0.4756825268268585,
"learning_rate": 9.850110727079735e-05,
"loss": 2.1712,
"step": 358
},
{
"epoch": 0.09021800590563549,
"grad_norm": 0.3752896785736084,
"learning_rate": 9.849137607673643e-05,
"loss": 2.4498,
"step": 359
},
{
"epoch": 0.09046930954325563,
"grad_norm": 0.4232666790485382,
"learning_rate": 9.848161387992874e-05,
"loss": 2.0123,
"step": 360
},
{
"epoch": 0.09072061318087579,
"grad_norm": 0.4741215705871582,
"learning_rate": 9.847182068661567e-05,
"loss": 2.0118,
"step": 361
},
{
"epoch": 0.09097191681849595,
"grad_norm": 0.41064271330833435,
"learning_rate": 9.846199650305846e-05,
"loss": 2.2685,
"step": 362
},
{
"epoch": 0.0912232204561161,
"grad_norm": 0.31049537658691406,
"learning_rate": 9.845214133553817e-05,
"loss": 1.9623,
"step": 363
},
{
"epoch": 0.09147452409373626,
"grad_norm": 0.2547619342803955,
"learning_rate": 9.844225519035565e-05,
"loss": 1.8739,
"step": 364
},
{
"epoch": 0.09172582773135642,
"grad_norm": 0.26991865038871765,
"learning_rate": 9.843233807383159e-05,
"loss": 2.2907,
"step": 365
},
{
"epoch": 0.09197713136897656,
"grad_norm": 0.3443757891654968,
"learning_rate": 9.84223899923064e-05,
"loss": 2.1534,
"step": 366
},
{
"epoch": 0.09222843500659672,
"grad_norm": 0.2624412477016449,
"learning_rate": 9.841241095214038e-05,
"loss": 2.4799,
"step": 367
},
{
"epoch": 0.09247973864421688,
"grad_norm": 0.34518447518348694,
"learning_rate": 9.840240095971358e-05,
"loss": 2.2512,
"step": 368
},
{
"epoch": 0.09273104228183703,
"grad_norm": 0.4448896646499634,
"learning_rate": 9.839236002142584e-05,
"loss": 2.3634,
"step": 369
},
{
"epoch": 0.09298234591945718,
"grad_norm": 0.1713920682668686,
"learning_rate": 9.83822881436968e-05,
"loss": 1.9883,
"step": 370
},
{
"epoch": 0.09323364955707734,
"grad_norm": 0.26538336277008057,
"learning_rate": 9.837218533296587e-05,
"loss": 2.3259,
"step": 371
},
{
"epoch": 0.09348495319469749,
"grad_norm": 0.3171433210372925,
"learning_rate": 9.83620515956922e-05,
"loss": 2.3368,
"step": 372
},
{
"epoch": 0.09373625683231765,
"grad_norm": 0.47525712847709656,
"learning_rate": 9.83518869383548e-05,
"loss": 2.0206,
"step": 373
},
{
"epoch": 0.09398756046993781,
"grad_norm": 0.6884750723838806,
"learning_rate": 9.834169136745237e-05,
"loss": 2.2423,
"step": 374
},
{
"epoch": 0.09423886410755795,
"grad_norm": 0.3786754012107849,
"learning_rate": 9.833146488950342e-05,
"loss": 2.5151,
"step": 375
},
{
"epoch": 0.09449016774517811,
"grad_norm": 0.15857549011707306,
"learning_rate": 9.832120751104617e-05,
"loss": 1.4356,
"step": 376
},
{
"epoch": 0.09474147138279826,
"grad_norm": 0.6414403915405273,
"learning_rate": 9.831091923863868e-05,
"loss": 2.2734,
"step": 377
},
{
"epoch": 0.09499277502041842,
"grad_norm": 0.25082263350486755,
"learning_rate": 9.830060007885868e-05,
"loss": 2.8101,
"step": 378
},
{
"epoch": 0.09524407865803858,
"grad_norm": 0.29801589250564575,
"learning_rate": 9.829025003830368e-05,
"loss": 1.8527,
"step": 379
},
{
"epoch": 0.09549538229565872,
"grad_norm": 0.293905109167099,
"learning_rate": 9.827986912359094e-05,
"loss": 2.1692,
"step": 380
},
{
"epoch": 0.09574668593327888,
"grad_norm": 0.5134365558624268,
"learning_rate": 9.826945734135744e-05,
"loss": 2.1052,
"step": 381
},
{
"epoch": 0.09599798957089904,
"grad_norm": 0.24011516571044922,
"learning_rate": 9.825901469825994e-05,
"loss": 1.8188,
"step": 382
},
{
"epoch": 0.09624929320851919,
"grad_norm": 0.39140889048576355,
"learning_rate": 9.824854120097485e-05,
"loss": 2.1372,
"step": 383
},
{
"epoch": 0.09650059684613935,
"grad_norm": 0.35121777653694153,
"learning_rate": 9.82380368561984e-05,
"loss": 2.2186,
"step": 384
},
{
"epoch": 0.0967519004837595,
"grad_norm": 0.3034665286540985,
"learning_rate": 9.822750167064645e-05,
"loss": 2.2167,
"step": 385
},
{
"epoch": 0.09700320412137965,
"grad_norm": 0.18376107513904572,
"learning_rate": 9.821693565105465e-05,
"loss": 1.573,
"step": 386
},
{
"epoch": 0.09725450775899981,
"grad_norm": 0.18543782830238342,
"learning_rate": 9.820633880417836e-05,
"loss": 2.0766,
"step": 387
},
{
"epoch": 0.09750581139661997,
"grad_norm": 0.3020473122596741,
"learning_rate": 9.819571113679258e-05,
"loss": 2.4098,
"step": 388
},
{
"epoch": 0.09775711503424012,
"grad_norm": 0.43857908248901367,
"learning_rate": 9.818505265569209e-05,
"loss": 2.3636,
"step": 389
},
{
"epoch": 0.09800841867186028,
"grad_norm": 0.31539830565452576,
"learning_rate": 9.817436336769135e-05,
"loss": 2.3027,
"step": 390
},
{
"epoch": 0.09825972230948044,
"grad_norm": 0.38662660121917725,
"learning_rate": 9.816364327962449e-05,
"loss": 2.7305,
"step": 391
},
{
"epoch": 0.09851102594710058,
"grad_norm": 0.6164030432701111,
"learning_rate": 9.815289239834536e-05,
"loss": 1.8858,
"step": 392
},
{
"epoch": 0.09876232958472074,
"grad_norm": 0.2521904408931732,
"learning_rate": 9.814211073072748e-05,
"loss": 1.9274,
"step": 393
},
{
"epoch": 0.0990136332223409,
"grad_norm": 0.35303381085395813,
"learning_rate": 9.813129828366407e-05,
"loss": 2.1945,
"step": 394
},
{
"epoch": 0.09926493685996104,
"grad_norm": 0.276737242937088,
"learning_rate": 9.812045506406803e-05,
"loss": 2.2649,
"step": 395
},
{
"epoch": 0.0995162404975812,
"grad_norm": 0.45490264892578125,
"learning_rate": 9.81095810788719e-05,
"loss": 2.37,
"step": 396
},
{
"epoch": 0.09976754413520136,
"grad_norm": 0.5738433599472046,
"learning_rate": 9.809867633502794e-05,
"loss": 2.1393,
"step": 397
},
{
"epoch": 0.10001884777282151,
"grad_norm": 0.2685263454914093,
"learning_rate": 9.808774083950802e-05,
"loss": 2.6325,
"step": 398
},
{
"epoch": 0.10027015141044167,
"grad_norm": 0.41066989302635193,
"learning_rate": 9.807677459930374e-05,
"loss": 2.1897,
"step": 399
},
{
"epoch": 0.10052145504806181,
"grad_norm": 0.41453301906585693,
"learning_rate": 9.806577762142628e-05,
"loss": 2.243,
"step": 400
},
{
"epoch": 0.10077275868568197,
"grad_norm": 0.39212143421173096,
"learning_rate": 9.805474991290652e-05,
"loss": 2.502,
"step": 401
},
{
"epoch": 0.10102406232330213,
"grad_norm": 0.23721270263195038,
"learning_rate": 9.804369148079498e-05,
"loss": 1.7259,
"step": 402
},
{
"epoch": 0.10127536596092228,
"grad_norm": 0.22118382155895233,
"learning_rate": 9.803260233216184e-05,
"loss": 1.1238,
"step": 403
},
{
"epoch": 0.10152666959854244,
"grad_norm": 0.386078804731369,
"learning_rate": 9.802148247409686e-05,
"loss": 2.1155,
"step": 404
},
{
"epoch": 0.1017779732361626,
"grad_norm": 0.3332570195198059,
"learning_rate": 9.80103319137095e-05,
"loss": 2.5907,
"step": 405
},
{
"epoch": 0.10202927687378274,
"grad_norm": 0.20631489157676697,
"learning_rate": 9.799915065812882e-05,
"loss": 2.3401,
"step": 406
},
{
"epoch": 0.1022805805114029,
"grad_norm": 0.529591977596283,
"learning_rate": 9.798793871450346e-05,
"loss": 2.3916,
"step": 407
},
{
"epoch": 0.10253188414902306,
"grad_norm": 0.32578417658805847,
"learning_rate": 9.79766960900018e-05,
"loss": 2.3661,
"step": 408
},
{
"epoch": 0.10278318778664321,
"grad_norm": 0.4134072959423065,
"learning_rate": 9.796542279181172e-05,
"loss": 2.0656,
"step": 409
},
{
"epoch": 0.10303449142426337,
"grad_norm": 0.3845951557159424,
"learning_rate": 9.795411882714076e-05,
"loss": 2.3176,
"step": 410
},
{
"epoch": 0.10328579506188353,
"grad_norm": 0.6262491345405579,
"learning_rate": 9.794278420321605e-05,
"loss": 2.42,
"step": 411
},
{
"epoch": 0.10353709869950367,
"grad_norm": 0.4291568696498871,
"learning_rate": 9.793141892728436e-05,
"loss": 1.9455,
"step": 412
},
{
"epoch": 0.10378840233712383,
"grad_norm": 0.24446240067481995,
"learning_rate": 9.792002300661201e-05,
"loss": 2.6102,
"step": 413
},
{
"epoch": 0.10403970597474399,
"grad_norm": 0.4551761746406555,
"learning_rate": 9.79085964484849e-05,
"loss": 2.0785,
"step": 414
},
{
"epoch": 0.10429100961236414,
"grad_norm": 0.384036123752594,
"learning_rate": 9.789713926020863e-05,
"loss": 2.3401,
"step": 415
},
{
"epoch": 0.1045423132499843,
"grad_norm": 0.2475469559431076,
"learning_rate": 9.788565144910822e-05,
"loss": 2.2387,
"step": 416
},
{
"epoch": 0.10479361688760445,
"grad_norm": 0.269940048456192,
"learning_rate": 9.78741330225284e-05,
"loss": 2.2894,
"step": 417
},
{
"epoch": 0.1050449205252246,
"grad_norm": 0.27409201860427856,
"learning_rate": 9.786258398783341e-05,
"loss": 2.2478,
"step": 418
},
{
"epoch": 0.10529622416284476,
"grad_norm": 0.28689828515052795,
"learning_rate": 9.785100435240706e-05,
"loss": 1.9406,
"step": 419
},
{
"epoch": 0.10554752780046492,
"grad_norm": 0.5282906293869019,
"learning_rate": 9.783939412365278e-05,
"loss": 2.1423,
"step": 420
},
{
"epoch": 0.10579883143808506,
"grad_norm": 0.2722564935684204,
"learning_rate": 9.782775330899347e-05,
"loss": 2.3398,
"step": 421
},
{
"epoch": 0.10605013507570522,
"grad_norm": 0.22443710267543793,
"learning_rate": 9.781608191587166e-05,
"loss": 2.2233,
"step": 422
},
{
"epoch": 0.10630143871332537,
"grad_norm": 1.7654945850372314,
"learning_rate": 9.78043799517494e-05,
"loss": 2.2441,
"step": 423
},
{
"epoch": 0.10655274235094553,
"grad_norm": 0.272491455078125,
"learning_rate": 9.779264742410829e-05,
"loss": 2.4267,
"step": 424
},
{
"epoch": 0.10680404598856569,
"grad_norm": 0.4860396683216095,
"learning_rate": 9.778088434044945e-05,
"loss": 2.3139,
"step": 425
},
{
"epoch": 0.10705534962618583,
"grad_norm": 0.5187298059463501,
"learning_rate": 9.77690907082936e-05,
"loss": 2.0393,
"step": 426
},
{
"epoch": 0.10730665326380599,
"grad_norm": 0.25615808367729187,
"learning_rate": 9.775726653518091e-05,
"loss": 2.4811,
"step": 427
},
{
"epoch": 0.10755795690142615,
"grad_norm": 0.26843705773353577,
"learning_rate": 9.774541182867112e-05,
"loss": 2.0025,
"step": 428
},
{
"epoch": 0.1078092605390463,
"grad_norm": 0.42573508620262146,
"learning_rate": 9.773352659634348e-05,
"loss": 2.3841,
"step": 429
},
{
"epoch": 0.10806056417666646,
"grad_norm": 0.34817081689834595,
"learning_rate": 9.772161084579679e-05,
"loss": 2.4695,
"step": 430
},
{
"epoch": 0.10831186781428662,
"grad_norm": 0.3741927742958069,
"learning_rate": 9.770966458464927e-05,
"loss": 2.099,
"step": 431
},
{
"epoch": 0.10856317145190676,
"grad_norm": 0.4013387858867645,
"learning_rate": 9.769768782053879e-05,
"loss": 2.2978,
"step": 432
},
{
"epoch": 0.10881447508952692,
"grad_norm": 0.4488285779953003,
"learning_rate": 9.768568056112258e-05,
"loss": 2.064,
"step": 433
},
{
"epoch": 0.10906577872714708,
"grad_norm": 0.43745100498199463,
"learning_rate": 9.767364281407745e-05,
"loss": 2.0517,
"step": 434
},
{
"epoch": 0.10931708236476723,
"grad_norm": 0.2905375361442566,
"learning_rate": 9.766157458709967e-05,
"loss": 2.43,
"step": 435
},
{
"epoch": 0.10956838600238739,
"grad_norm": 0.3817865252494812,
"learning_rate": 9.764947588790502e-05,
"loss": 2.439,
"step": 436
},
{
"epoch": 0.10981968964000755,
"grad_norm": 0.44527363777160645,
"learning_rate": 9.763734672422876e-05,
"loss": 2.4307,
"step": 437
},
{
"epoch": 0.11007099327762769,
"grad_norm": 0.5595135688781738,
"learning_rate": 9.76251871038256e-05,
"loss": 2.1648,
"step": 438
},
{
"epoch": 0.11032229691524785,
"grad_norm": 0.4044279456138611,
"learning_rate": 9.761299703446973e-05,
"loss": 2.3435,
"step": 439
},
{
"epoch": 0.11057360055286801,
"grad_norm": 0.1558169573545456,
"learning_rate": 9.760077652395483e-05,
"loss": 0.9334,
"step": 440
},
{
"epoch": 0.11082490419048815,
"grad_norm": 0.42228519916534424,
"learning_rate": 9.758852558009404e-05,
"loss": 2.4764,
"step": 441
},
{
"epoch": 0.11107620782810831,
"grad_norm": 0.18882694840431213,
"learning_rate": 9.757624421071993e-05,
"loss": 1.3726,
"step": 442
},
{
"epoch": 0.11132751146572846,
"grad_norm": 0.37905827164649963,
"learning_rate": 9.756393242368453e-05,
"loss": 1.8814,
"step": 443
},
{
"epoch": 0.11157881510334862,
"grad_norm": 0.347260445356369,
"learning_rate": 9.755159022685936e-05,
"loss": 2.6184,
"step": 444
},
{
"epoch": 0.11183011874096878,
"grad_norm": 0.4600488543510437,
"learning_rate": 9.753921762813534e-05,
"loss": 2.2642,
"step": 445
},
{
"epoch": 0.11208142237858892,
"grad_norm": 0.3841269612312317,
"learning_rate": 9.75268146354228e-05,
"loss": 2.2473,
"step": 446
},
{
"epoch": 0.11233272601620908,
"grad_norm": 0.4765447676181793,
"learning_rate": 9.751438125665158e-05,
"loss": 2.3474,
"step": 447
},
{
"epoch": 0.11258402965382924,
"grad_norm": 0.3766055405139923,
"learning_rate": 9.750191749977089e-05,
"loss": 1.809,
"step": 448
},
{
"epoch": 0.11283533329144939,
"grad_norm": 0.3452647626399994,
"learning_rate": 9.748942337274938e-05,
"loss": 2.5878,
"step": 449
},
{
"epoch": 0.11308663692906955,
"grad_norm": 0.21866516768932343,
"learning_rate": 9.747689888357509e-05,
"loss": 0.992,
"step": 450
},
{
"epoch": 0.11333794056668971,
"grad_norm": 0.36812421679496765,
"learning_rate": 9.746434404025555e-05,
"loss": 2.0005,
"step": 451
},
{
"epoch": 0.11358924420430985,
"grad_norm": 0.44335392117500305,
"learning_rate": 9.74517588508176e-05,
"loss": 2.4726,
"step": 452
},
{
"epoch": 0.11384054784193001,
"grad_norm": 0.448779433965683,
"learning_rate": 9.743914332330754e-05,
"loss": 2.2657,
"step": 453
},
{
"epoch": 0.11409185147955017,
"grad_norm": 0.5500572323799133,
"learning_rate": 9.742649746579105e-05,
"loss": 2.4726,
"step": 454
},
{
"epoch": 0.11434315511717032,
"grad_norm": 0.47495442628860474,
"learning_rate": 9.741382128635321e-05,
"loss": 2.4044,
"step": 455
},
{
"epoch": 0.11459445875479048,
"grad_norm": 0.4240530729293823,
"learning_rate": 9.740111479309847e-05,
"loss": 2.6976,
"step": 456
},
{
"epoch": 0.11484576239241064,
"grad_norm": 0.48315781354904175,
"learning_rate": 9.738837799415067e-05,
"loss": 2.4035,
"step": 457
},
{
"epoch": 0.11509706603003078,
"grad_norm": 0.28931689262390137,
"learning_rate": 9.737561089765303e-05,
"loss": 1.9762,
"step": 458
},
{
"epoch": 0.11534836966765094,
"grad_norm": 0.2892288267612457,
"learning_rate": 9.736281351176813e-05,
"loss": 2.1718,
"step": 459
},
{
"epoch": 0.1155996733052711,
"grad_norm": 0.4304305911064148,
"learning_rate": 9.734998584467794e-05,
"loss": 2.2799,
"step": 460
},
{
"epoch": 0.11585097694289125,
"grad_norm": 0.2887556552886963,
"learning_rate": 9.733712790458375e-05,
"loss": 2.4617,
"step": 461
},
{
"epoch": 0.1161022805805114,
"grad_norm": 0.37903836369514465,
"learning_rate": 9.732423969970626e-05,
"loss": 2.4154,
"step": 462
},
{
"epoch": 0.11635358421813156,
"grad_norm": 0.34230297803878784,
"learning_rate": 9.731132123828543e-05,
"loss": 1.9664,
"step": 463
},
{
"epoch": 0.11660488785575171,
"grad_norm": 0.4099084138870239,
"learning_rate": 9.729837252858067e-05,
"loss": 2.473,
"step": 464
},
{
"epoch": 0.11685619149337187,
"grad_norm": 0.4079136252403259,
"learning_rate": 9.728539357887068e-05,
"loss": 1.8914,
"step": 465
},
{
"epoch": 0.11710749513099201,
"grad_norm": 0.2663559317588806,
"learning_rate": 9.727238439745346e-05,
"loss": 2.1168,
"step": 466
},
{
"epoch": 0.11735879876861217,
"grad_norm": 0.35732075572013855,
"learning_rate": 9.72593449926464e-05,
"loss": 2.3938,
"step": 467
},
{
"epoch": 0.11761010240623233,
"grad_norm": 0.26939693093299866,
"learning_rate": 9.724627537278616e-05,
"loss": 1.9446,
"step": 468
},
{
"epoch": 0.11786140604385248,
"grad_norm": 0.3639311194419861,
"learning_rate": 9.72331755462288e-05,
"loss": 2.8929,
"step": 469
},
{
"epoch": 0.11811270968147264,
"grad_norm": 0.26295900344848633,
"learning_rate": 9.722004552134956e-05,
"loss": 2.3815,
"step": 470
},
{
"epoch": 0.1183640133190928,
"grad_norm": 0.4366108179092407,
"learning_rate": 9.720688530654311e-05,
"loss": 2.1099,
"step": 471
},
{
"epoch": 0.11861531695671294,
"grad_norm": 0.5005189776420593,
"learning_rate": 9.719369491022339e-05,
"loss": 2.7562,
"step": 472
},
{
"epoch": 0.1188666205943331,
"grad_norm": 0.3057880401611328,
"learning_rate": 9.718047434082357e-05,
"loss": 2.5181,
"step": 473
},
{
"epoch": 0.11911792423195326,
"grad_norm": 0.2341679483652115,
"learning_rate": 9.716722360679619e-05,
"loss": 1.2066,
"step": 474
},
{
"epoch": 0.11936922786957341,
"grad_norm": 0.27327990531921387,
"learning_rate": 9.715394271661306e-05,
"loss": 2.3469,
"step": 475
},
{
"epoch": 0.11962053150719357,
"grad_norm": 0.39172980189323425,
"learning_rate": 9.714063167876527e-05,
"loss": 2.2347,
"step": 476
},
{
"epoch": 0.11987183514481373,
"grad_norm": 0.49665653705596924,
"learning_rate": 9.71272905017631e-05,
"loss": 1.954,
"step": 477
},
{
"epoch": 0.12012313878243387,
"grad_norm": 0.2704184949398041,
"learning_rate": 9.711391919413626e-05,
"loss": 2.3236,
"step": 478
},
{
"epoch": 0.12037444242005403,
"grad_norm": 0.17571642994880676,
"learning_rate": 9.710051776443358e-05,
"loss": 1.277,
"step": 479
},
{
"epoch": 0.12062574605767419,
"grad_norm": 0.2642328143119812,
"learning_rate": 9.708708622122322e-05,
"loss": 2.4537,
"step": 480
},
{
"epoch": 0.12087704969529434,
"grad_norm": 0.21277543902397156,
"learning_rate": 9.707362457309261e-05,
"loss": 1.3385,
"step": 481
},
{
"epoch": 0.1211283533329145,
"grad_norm": 0.3256551921367645,
"learning_rate": 9.706013282864834e-05,
"loss": 2.1567,
"step": 482
},
{
"epoch": 0.12137965697053466,
"grad_norm": 0.4194876551628113,
"learning_rate": 9.704661099651633e-05,
"loss": 2.2617,
"step": 483
},
{
"epoch": 0.1216309606081548,
"grad_norm": 0.13989388942718506,
"learning_rate": 9.70330590853417e-05,
"loss": 0.8074,
"step": 484
},
{
"epoch": 0.12188226424577496,
"grad_norm": 0.2916125953197479,
"learning_rate": 9.701947710378881e-05,
"loss": 2.0217,
"step": 485
},
{
"epoch": 0.1221335678833951,
"grad_norm": 0.17038998007774353,
"learning_rate": 9.700586506054121e-05,
"loss": 0.8153,
"step": 486
},
{
"epoch": 0.12238487152101526,
"grad_norm": 0.39059990644454956,
"learning_rate": 9.699222296430172e-05,
"loss": 2.161,
"step": 487
},
{
"epoch": 0.12263617515863542,
"grad_norm": 0.5335647463798523,
"learning_rate": 9.697855082379239e-05,
"loss": 2.2604,
"step": 488
},
{
"epoch": 0.12288747879625557,
"grad_norm": 0.4468785524368286,
"learning_rate": 9.696484864775437e-05,
"loss": 2.3022,
"step": 489
},
{
"epoch": 0.12313878243387573,
"grad_norm": 0.38395336270332336,
"learning_rate": 9.695111644494814e-05,
"loss": 2.2467,
"step": 490
},
{
"epoch": 0.12339008607149589,
"grad_norm": 0.35352465510368347,
"learning_rate": 9.693735422415332e-05,
"loss": 2.7816,
"step": 491
},
{
"epoch": 0.12364138970911603,
"grad_norm": 0.5462369322776794,
"learning_rate": 9.692356199416868e-05,
"loss": 2.2723,
"step": 492
},
{
"epoch": 0.1238926933467362,
"grad_norm": 0.24535368382930756,
"learning_rate": 9.690973976381228e-05,
"loss": 2.0469,
"step": 493
},
{
"epoch": 0.12414399698435635,
"grad_norm": 0.3857629597187042,
"learning_rate": 9.689588754192126e-05,
"loss": 2.1396,
"step": 494
},
{
"epoch": 0.1243953006219765,
"grad_norm": 0.4369061291217804,
"learning_rate": 9.688200533735199e-05,
"loss": 2.3099,
"step": 495
},
{
"epoch": 0.12464660425959666,
"grad_norm": 0.22359801828861237,
"learning_rate": 9.686809315898e-05,
"loss": 2.113,
"step": 496
},
{
"epoch": 0.12489790789721682,
"grad_norm": 0.3624935746192932,
"learning_rate": 9.685415101569999e-05,
"loss": 2.2168,
"step": 497
},
{
"epoch": 0.12514921153483696,
"grad_norm": 0.4480370581150055,
"learning_rate": 9.684017891642578e-05,
"loss": 2.5805,
"step": 498
},
{
"epoch": 0.12540051517245712,
"grad_norm": 0.47670865058898926,
"learning_rate": 9.682617687009039e-05,
"loss": 1.5993,
"step": 499
},
{
"epoch": 0.12565181881007728,
"grad_norm": 0.37690502405166626,
"learning_rate": 9.681214488564596e-05,
"loss": 2.125,
"step": 500
},
{
"epoch": 0.12590312244769744,
"grad_norm": 0.29597729444503784,
"learning_rate": 9.679808297206377e-05,
"loss": 2.6068,
"step": 501
},
{
"epoch": 0.12615442608531757,
"grad_norm": 0.33130696415901184,
"learning_rate": 9.678399113833425e-05,
"loss": 2.2649,
"step": 502
},
{
"epoch": 0.12640572972293773,
"grad_norm": 0.4030790627002716,
"learning_rate": 9.676986939346696e-05,
"loss": 1.8589,
"step": 503
},
{
"epoch": 0.1266570333605579,
"grad_norm": 0.37136292457580566,
"learning_rate": 9.675571774649057e-05,
"loss": 2.1863,
"step": 504
},
{
"epoch": 0.12690833699817805,
"grad_norm": 0.439748615026474,
"learning_rate": 9.674153620645287e-05,
"loss": 2.6519,
"step": 505
},
{
"epoch": 0.1271596406357982,
"grad_norm": 0.3839961588382721,
"learning_rate": 9.672732478242075e-05,
"loss": 2.4758,
"step": 506
},
{
"epoch": 0.12741094427341837,
"grad_norm": 0.2992021441459656,
"learning_rate": 9.671308348348025e-05,
"loss": 2.4348,
"step": 507
},
{
"epoch": 0.1276622479110385,
"grad_norm": 0.3640328645706177,
"learning_rate": 9.669881231873646e-05,
"loss": 2.6048,
"step": 508
},
{
"epoch": 0.12791355154865866,
"grad_norm": 0.27911099791526794,
"learning_rate": 9.66845112973136e-05,
"loss": 2.4449,
"step": 509
},
{
"epoch": 0.12816485518627882,
"grad_norm": 0.4001411199569702,
"learning_rate": 9.667018042835496e-05,
"loss": 3.0271,
"step": 510
},
{
"epoch": 0.12841615882389898,
"grad_norm": 0.4469778537750244,
"learning_rate": 9.665581972102291e-05,
"loss": 2.192,
"step": 511
},
{
"epoch": 0.12866746246151914,
"grad_norm": 0.4696493446826935,
"learning_rate": 9.66414291844989e-05,
"loss": 2.0927,
"step": 512
},
{
"epoch": 0.1289187660991393,
"grad_norm": 0.485324889421463,
"learning_rate": 9.662700882798348e-05,
"loss": 1.8437,
"step": 513
},
{
"epoch": 0.12917006973675943,
"grad_norm": 0.2740216851234436,
"learning_rate": 9.661255866069622e-05,
"loss": 2.0895,
"step": 514
},
{
"epoch": 0.1294213733743796,
"grad_norm": 0.48760735988616943,
"learning_rate": 9.659807869187578e-05,
"loss": 2.3416,
"step": 515
},
{
"epoch": 0.12967267701199975,
"grad_norm": 0.14928022027015686,
"learning_rate": 9.658356893077987e-05,
"loss": 1.1877,
"step": 516
},
{
"epoch": 0.1299239806496199,
"grad_norm": 0.46163851022720337,
"learning_rate": 9.656902938668524e-05,
"loss": 2.6305,
"step": 517
},
{
"epoch": 0.13017528428724007,
"grad_norm": 0.30622944235801697,
"learning_rate": 9.655446006888766e-05,
"loss": 1.5633,
"step": 518
},
{
"epoch": 0.1304265879248602,
"grad_norm": 0.30690157413482666,
"learning_rate": 9.653986098670198e-05,
"loss": 2.5689,
"step": 519
},
{
"epoch": 0.13067789156248036,
"grad_norm": 0.3105219602584839,
"learning_rate": 9.652523214946205e-05,
"loss": 2.564,
"step": 520
},
{
"epoch": 0.13092919520010052,
"grad_norm": 0.2955935299396515,
"learning_rate": 9.651057356652077e-05,
"loss": 2.5073,
"step": 521
},
{
"epoch": 0.13118049883772068,
"grad_norm": 0.19297459721565247,
"learning_rate": 9.649588524725002e-05,
"loss": 1.8412,
"step": 522
},
{
"epoch": 0.13143180247534084,
"grad_norm": 0.3480035066604614,
"learning_rate": 9.64811672010407e-05,
"loss": 2.5528,
"step": 523
},
{
"epoch": 0.131683106112961,
"grad_norm": 0.15509484708309174,
"learning_rate": 9.646641943730277e-05,
"loss": 0.7748,
"step": 524
},
{
"epoch": 0.13193440975058113,
"grad_norm": 0.33523187041282654,
"learning_rate": 9.645164196546512e-05,
"loss": 1.939,
"step": 525
},
{
"epoch": 0.1321857133882013,
"grad_norm": 0.3940199613571167,
"learning_rate": 9.643683479497567e-05,
"loss": 2.3232,
"step": 526
},
{
"epoch": 0.13243701702582145,
"grad_norm": 0.3470746576786041,
"learning_rate": 9.64219979353013e-05,
"loss": 1.933,
"step": 527
},
{
"epoch": 0.1326883206634416,
"grad_norm": 0.3262689709663391,
"learning_rate": 9.640713139592792e-05,
"loss": 2.3454,
"step": 528
},
{
"epoch": 0.13293962430106177,
"grad_norm": 0.5404649972915649,
"learning_rate": 9.639223518636036e-05,
"loss": 2.4712,
"step": 529
},
{
"epoch": 0.13319092793868192,
"grad_norm": 0.30311527848243713,
"learning_rate": 9.637730931612245e-05,
"loss": 2.0744,
"step": 530
},
{
"epoch": 0.13344223157630206,
"grad_norm": 0.9992802739143372,
"learning_rate": 9.6362353794757e-05,
"loss": 2.1378,
"step": 531
},
{
"epoch": 0.13369353521392222,
"grad_norm": 0.2992432415485382,
"learning_rate": 9.634736863182574e-05,
"loss": 2.4675,
"step": 532
},
{
"epoch": 0.13394483885154237,
"grad_norm": 0.3760550618171692,
"learning_rate": 9.633235383690937e-05,
"loss": 2.3762,
"step": 533
},
{
"epoch": 0.13419614248916253,
"grad_norm": 0.3664836287498474,
"learning_rate": 9.631730941960752e-05,
"loss": 2.2417,
"step": 534
},
{
"epoch": 0.1344474461267827,
"grad_norm": 0.4195888042449951,
"learning_rate": 9.630223538953881e-05,
"loss": 1.9261,
"step": 535
},
{
"epoch": 0.13469874976440285,
"grad_norm": 0.39428257942199707,
"learning_rate": 9.628713175634072e-05,
"loss": 2.2189,
"step": 536
},
{
"epoch": 0.13495005340202298,
"grad_norm": 0.3145690858364105,
"learning_rate": 9.627199852966969e-05,
"loss": 2.1252,
"step": 537
},
{
"epoch": 0.13520135703964314,
"grad_norm": 0.41197821497917175,
"learning_rate": 9.625683571920108e-05,
"loss": 2.0061,
"step": 538
},
{
"epoch": 0.1354526606772633,
"grad_norm": 0.4640690088272095,
"learning_rate": 9.62416433346292e-05,
"loss": 1.9037,
"step": 539
},
{
"epoch": 0.13570396431488346,
"grad_norm": 0.39173510670661926,
"learning_rate": 9.62264213856672e-05,
"loss": 2.1323,
"step": 540
},
{
"epoch": 0.13595526795250362,
"grad_norm": 0.33818063139915466,
"learning_rate": 9.62111698820472e-05,
"loss": 2.4662,
"step": 541
},
{
"epoch": 0.13620657159012375,
"grad_norm": 0.3888367712497711,
"learning_rate": 9.619588883352011e-05,
"loss": 1.987,
"step": 542
},
{
"epoch": 0.1364578752277439,
"grad_norm": 0.49469324946403503,
"learning_rate": 9.61805782498559e-05,
"loss": 2.075,
"step": 543
},
{
"epoch": 0.13670917886536407,
"grad_norm": 0.3214045464992523,
"learning_rate": 9.616523814084324e-05,
"loss": 2.6687,
"step": 544
},
{
"epoch": 0.13696048250298423,
"grad_norm": 0.3943631649017334,
"learning_rate": 9.61498685162898e-05,
"loss": 1.9817,
"step": 545
},
{
"epoch": 0.1372117861406044,
"grad_norm": 0.46393269300460815,
"learning_rate": 9.613446938602209e-05,
"loss": 2.2147,
"step": 546
},
{
"epoch": 0.13746308977822455,
"grad_norm": 0.3647920489311218,
"learning_rate": 9.611904075988544e-05,
"loss": 1.9163,
"step": 547
},
{
"epoch": 0.13771439341584468,
"grad_norm": 0.3861480951309204,
"learning_rate": 9.610358264774411e-05,
"loss": 2.7924,
"step": 548
},
{
"epoch": 0.13796569705346484,
"grad_norm": 0.4985499083995819,
"learning_rate": 9.608809505948114e-05,
"loss": 2.0298,
"step": 549
},
{
"epoch": 0.138217000691085,
"grad_norm": 0.4473382532596588,
"learning_rate": 9.607257800499849e-05,
"loss": 2.0965,
"step": 550
},
{
"epoch": 0.13846830432870516,
"grad_norm": 0.4468131959438324,
"learning_rate": 9.60570314942169e-05,
"loss": 2.311,
"step": 551
},
{
"epoch": 0.13871960796632532,
"grad_norm": 0.1555897295475006,
"learning_rate": 9.604145553707595e-05,
"loss": 0.8849,
"step": 552
},
{
"epoch": 0.13897091160394548,
"grad_norm": 0.17197562754154205,
"learning_rate": 9.602585014353409e-05,
"loss": 1.0516,
"step": 553
},
{
"epoch": 0.1392222152415656,
"grad_norm": 0.22020426392555237,
"learning_rate": 9.601021532356854e-05,
"loss": 1.9116,
"step": 554
},
{
"epoch": 0.13947351887918577,
"grad_norm": 0.33761653304100037,
"learning_rate": 9.599455108717535e-05,
"loss": 2.3363,
"step": 555
},
{
"epoch": 0.13972482251680593,
"grad_norm": 0.5171981453895569,
"learning_rate": 9.59788574443694e-05,
"loss": 2.0701,
"step": 556
},
{
"epoch": 0.1399761261544261,
"grad_norm": 0.22373630106449127,
"learning_rate": 9.596313440518432e-05,
"loss": 1.0946,
"step": 557
},
{
"epoch": 0.14022742979204625,
"grad_norm": 0.5129598379135132,
"learning_rate": 9.594738197967259e-05,
"loss": 2.3418,
"step": 558
},
{
"epoch": 0.1404787334296664,
"grad_norm": 0.4349968433380127,
"learning_rate": 9.593160017790546e-05,
"loss": 2.0397,
"step": 559
},
{
"epoch": 0.14073003706728654,
"grad_norm": 0.39511239528656006,
"learning_rate": 9.591578900997292e-05,
"loss": 2.2823,
"step": 560
},
{
"epoch": 0.1409813407049067,
"grad_norm": 0.25913137197494507,
"learning_rate": 9.58999484859838e-05,
"loss": 2.3511,
"step": 561
},
{
"epoch": 0.14123264434252686,
"grad_norm": 0.19200782477855682,
"learning_rate": 9.588407861606566e-05,
"loss": 1.6859,
"step": 562
},
{
"epoch": 0.14148394798014702,
"grad_norm": 0.30504798889160156,
"learning_rate": 9.586817941036483e-05,
"loss": 2.1132,
"step": 563
},
{
"epoch": 0.14173525161776718,
"grad_norm": 0.46058428287506104,
"learning_rate": 9.585225087904641e-05,
"loss": 2.4114,
"step": 564
},
{
"epoch": 0.1419865552553873,
"grad_norm": 0.2597549557685852,
"learning_rate": 9.583629303229423e-05,
"loss": 2.4956,
"step": 565
},
{
"epoch": 0.14223785889300747,
"grad_norm": 0.35064586997032166,
"learning_rate": 9.582030588031084e-05,
"loss": 2.4188,
"step": 566
},
{
"epoch": 0.14248916253062763,
"grad_norm": 0.21857944130897522,
"learning_rate": 9.580428943331758e-05,
"loss": 1.4052,
"step": 567
},
{
"epoch": 0.1427404661682478,
"grad_norm": 0.4945662021636963,
"learning_rate": 9.578824370155451e-05,
"loss": 2.6965,
"step": 568
},
{
"epoch": 0.14299176980586795,
"grad_norm": 0.37999972701072693,
"learning_rate": 9.577216869528038e-05,
"loss": 2.5524,
"step": 569
},
{
"epoch": 0.1432430734434881,
"grad_norm": 0.47479113936424255,
"learning_rate": 9.575606442477267e-05,
"loss": 2.166,
"step": 570
},
{
"epoch": 0.14349437708110824,
"grad_norm": 0.42284709215164185,
"learning_rate": 9.573993090032758e-05,
"loss": 1.7279,
"step": 571
},
{
"epoch": 0.1437456807187284,
"grad_norm": 0.412218302488327,
"learning_rate": 9.572376813225999e-05,
"loss": 2.1049,
"step": 572
},
{
"epoch": 0.14399698435634856,
"grad_norm": 0.6507567167282104,
"learning_rate": 9.570757613090353e-05,
"loss": 2.5453,
"step": 573
},
{
"epoch": 0.14424828799396872,
"grad_norm": 0.31104111671447754,
"learning_rate": 9.569135490661046e-05,
"loss": 2.6578,
"step": 574
},
{
"epoch": 0.14449959163158888,
"grad_norm": 0.3139590620994568,
"learning_rate": 9.567510446975176e-05,
"loss": 2.5413,
"step": 575
},
{
"epoch": 0.14475089526920903,
"grad_norm": 0.5592718720436096,
"learning_rate": 9.565882483071706e-05,
"loss": 2.3341,
"step": 576
},
{
"epoch": 0.14500219890682917,
"grad_norm": 0.4113386869430542,
"learning_rate": 9.564251599991467e-05,
"loss": 2.7414,
"step": 577
},
{
"epoch": 0.14525350254444933,
"grad_norm": 0.2924419045448303,
"learning_rate": 9.56261779877716e-05,
"loss": 1.882,
"step": 578
},
{
"epoch": 0.14550480618206948,
"grad_norm": 0.3123188316822052,
"learning_rate": 9.560981080473346e-05,
"loss": 2.2884,
"step": 579
},
{
"epoch": 0.14575610981968964,
"grad_norm": 0.3279459476470947,
"learning_rate": 9.559341446126455e-05,
"loss": 1.8245,
"step": 580
},
{
"epoch": 0.1460074134573098,
"grad_norm": 0.2936881184577942,
"learning_rate": 9.55769889678478e-05,
"loss": 2.2238,
"step": 581
},
{
"epoch": 0.14625871709492994,
"grad_norm": 0.3411659598350525,
"learning_rate": 9.556053433498475e-05,
"loss": 2.1781,
"step": 582
},
{
"epoch": 0.1465100207325501,
"grad_norm": 0.42901405692100525,
"learning_rate": 9.554405057319565e-05,
"loss": 2.5198,
"step": 583
},
{
"epoch": 0.14676132437017025,
"grad_norm": 0.37800028920173645,
"learning_rate": 9.552753769301925e-05,
"loss": 2.009,
"step": 584
},
{
"epoch": 0.1470126280077904,
"grad_norm": 0.35843050479888916,
"learning_rate": 9.551099570501305e-05,
"loss": 2.0567,
"step": 585
},
{
"epoch": 0.14726393164541057,
"grad_norm": 0.4403095245361328,
"learning_rate": 9.549442461975306e-05,
"loss": 2.242,
"step": 586
},
{
"epoch": 0.14751523528303073,
"grad_norm": 0.3680509626865387,
"learning_rate": 9.547782444783393e-05,
"loss": 1.9327,
"step": 587
},
{
"epoch": 0.14776653892065086,
"grad_norm": 0.26541033387184143,
"learning_rate": 9.546119519986894e-05,
"loss": 2.5879,
"step": 588
},
{
"epoch": 0.14801784255827102,
"grad_norm": 0.3599735200405121,
"learning_rate": 9.544453688648989e-05,
"loss": 2.448,
"step": 589
},
{
"epoch": 0.14826914619589118,
"grad_norm": 0.46123915910720825,
"learning_rate": 9.542784951834721e-05,
"loss": 2.1933,
"step": 590
},
{
"epoch": 0.14852044983351134,
"grad_norm": 0.3718903064727783,
"learning_rate": 9.54111331061099e-05,
"loss": 2.0662,
"step": 591
},
{
"epoch": 0.1487717534711315,
"grad_norm": 0.5671700239181519,
"learning_rate": 9.539438766046554e-05,
"loss": 2.2881,
"step": 592
},
{
"epoch": 0.14902305710875166,
"grad_norm": 0.415791779756546,
"learning_rate": 9.537761319212021e-05,
"loss": 2.1724,
"step": 593
},
{
"epoch": 0.1492743607463718,
"grad_norm": 0.33377009630203247,
"learning_rate": 9.536080971179864e-05,
"loss": 2.2665,
"step": 594
},
{
"epoch": 0.14952566438399195,
"grad_norm": 0.48479607701301575,
"learning_rate": 9.534397723024402e-05,
"loss": 1.9262,
"step": 595
},
{
"epoch": 0.1497769680216121,
"grad_norm": 0.4698795974254608,
"learning_rate": 9.532711575821816e-05,
"loss": 2.3923,
"step": 596
},
{
"epoch": 0.15002827165923227,
"grad_norm": 0.45373061299324036,
"learning_rate": 9.531022530650135e-05,
"loss": 2.2584,
"step": 597
},
{
"epoch": 0.15027957529685243,
"grad_norm": 0.25719451904296875,
"learning_rate": 9.529330588589243e-05,
"loss": 2.1529,
"step": 598
},
{
"epoch": 0.1505308789344726,
"grad_norm": 0.27690425515174866,
"learning_rate": 9.527635750720875e-05,
"loss": 2.1076,
"step": 599
},
{
"epoch": 0.15078218257209272,
"grad_norm": 0.11603706330060959,
"learning_rate": 9.525938018128617e-05,
"loss": 0.5727,
"step": 600
},
{
"epoch": 0.15103348620971288,
"grad_norm": 0.39744696021080017,
"learning_rate": 9.524237391897909e-05,
"loss": 2.1513,
"step": 601
},
{
"epoch": 0.15128478984733304,
"grad_norm": 0.3867836594581604,
"learning_rate": 9.522533873116041e-05,
"loss": 2.3665,
"step": 602
},
{
"epoch": 0.1515360934849532,
"grad_norm": 0.16841183602809906,
"learning_rate": 9.520827462872144e-05,
"loss": 1.1614,
"step": 603
},
{
"epoch": 0.15178739712257336,
"grad_norm": 0.34245559573173523,
"learning_rate": 9.519118162257209e-05,
"loss": 2.2823,
"step": 604
},
{
"epoch": 0.1520387007601935,
"grad_norm": 0.47992077469825745,
"learning_rate": 9.517405972364067e-05,
"loss": 1.9997,
"step": 605
},
{
"epoch": 0.15229000439781365,
"grad_norm": 0.2888699471950531,
"learning_rate": 9.5156908942874e-05,
"loss": 2.2298,
"step": 606
},
{
"epoch": 0.1525413080354338,
"grad_norm": 0.1741029918193817,
"learning_rate": 9.513972929123737e-05,
"loss": 0.7344,
"step": 607
},
{
"epoch": 0.15279261167305397,
"grad_norm": 0.3610229790210724,
"learning_rate": 9.512252077971448e-05,
"loss": 2.0037,
"step": 608
},
{
"epoch": 0.15304391531067413,
"grad_norm": 0.30153217911720276,
"learning_rate": 9.510528341930756e-05,
"loss": 2.1396,
"step": 609
},
{
"epoch": 0.1532952189482943,
"grad_norm": 0.5049226880073547,
"learning_rate": 9.50880172210372e-05,
"loss": 2.1197,
"step": 610
},
{
"epoch": 0.15354652258591442,
"grad_norm": 0.27209773659706116,
"learning_rate": 9.507072219594249e-05,
"loss": 2.37,
"step": 611
},
{
"epoch": 0.15379782622353458,
"grad_norm": 0.33535251021385193,
"learning_rate": 9.505339835508091e-05,
"loss": 2.4393,
"step": 612
},
{
"epoch": 0.15404912986115474,
"grad_norm": 0.4262019991874695,
"learning_rate": 9.50360457095284e-05,
"loss": 2.0283,
"step": 613
},
{
"epoch": 0.1543004334987749,
"grad_norm": 0.5614545941352844,
"learning_rate": 9.50186642703793e-05,
"loss": 2.0785,
"step": 614
},
{
"epoch": 0.15455173713639506,
"grad_norm": 0.3979303240776062,
"learning_rate": 9.500125404874631e-05,
"loss": 2.6146,
"step": 615
},
{
"epoch": 0.15480304077401522,
"grad_norm": 0.39508217573165894,
"learning_rate": 9.498381505576064e-05,
"loss": 2.2959,
"step": 616
},
{
"epoch": 0.15505434441163535,
"grad_norm": 0.36526814103126526,
"learning_rate": 9.49663473025718e-05,
"loss": 2.3149,
"step": 617
},
{
"epoch": 0.1553056480492555,
"grad_norm": 0.6422840356826782,
"learning_rate": 9.494885080034774e-05,
"loss": 2.0929,
"step": 618
},
{
"epoch": 0.15555695168687567,
"grad_norm": 0.32328036427497864,
"learning_rate": 9.493132556027475e-05,
"loss": 2.2233,
"step": 619
},
{
"epoch": 0.15580825532449583,
"grad_norm": 0.25220245122909546,
"learning_rate": 9.491377159355752e-05,
"loss": 2.5095,
"step": 620
},
{
"epoch": 0.15605955896211599,
"grad_norm": 0.32473161816596985,
"learning_rate": 9.489618891141911e-05,
"loss": 2.6889,
"step": 621
},
{
"epoch": 0.15631086259973614,
"grad_norm": 0.4301775395870209,
"learning_rate": 9.487857752510093e-05,
"loss": 2.227,
"step": 622
},
{
"epoch": 0.15656216623735628,
"grad_norm": 0.4302082061767578,
"learning_rate": 9.486093744586271e-05,
"loss": 2.2468,
"step": 623
},
{
"epoch": 0.15681346987497644,
"grad_norm": 0.39020925760269165,
"learning_rate": 9.484326868498261e-05,
"loss": 2.3726,
"step": 624
},
{
"epoch": 0.1570647735125966,
"grad_norm": 0.3023484945297241,
"learning_rate": 9.482557125375704e-05,
"loss": 2.4235,
"step": 625
},
{
"epoch": 0.15731607715021675,
"grad_norm": 0.3555540442466736,
"learning_rate": 9.480784516350079e-05,
"loss": 2.4532,
"step": 626
},
{
"epoch": 0.1575673807878369,
"grad_norm": 0.3152056634426117,
"learning_rate": 9.479009042554694e-05,
"loss": 2.7172,
"step": 627
},
{
"epoch": 0.15781868442545705,
"grad_norm": 0.5882668495178223,
"learning_rate": 9.477230705124692e-05,
"loss": 2.5912,
"step": 628
},
{
"epoch": 0.1580699880630772,
"grad_norm": 0.36808767914772034,
"learning_rate": 9.475449505197043e-05,
"loss": 2.2798,
"step": 629
},
{
"epoch": 0.15832129170069736,
"grad_norm": 0.44441547989845276,
"learning_rate": 9.473665443910551e-05,
"loss": 1.9648,
"step": 630
},
{
"epoch": 0.15857259533831752,
"grad_norm": 0.40813305974006653,
"learning_rate": 9.471878522405849e-05,
"loss": 1.7885,
"step": 631
},
{
"epoch": 0.15882389897593768,
"grad_norm": 0.32029062509536743,
"learning_rate": 9.470088741825394e-05,
"loss": 2.6975,
"step": 632
},
{
"epoch": 0.15907520261355784,
"grad_norm": 0.270475834608078,
"learning_rate": 9.468296103313476e-05,
"loss": 0.8822,
"step": 633
},
{
"epoch": 0.15932650625117797,
"grad_norm": 0.3409525454044342,
"learning_rate": 9.46650060801621e-05,
"loss": 1.7725,
"step": 634
},
{
"epoch": 0.15957780988879813,
"grad_norm": 0.4669889807701111,
"learning_rate": 9.464702257081539e-05,
"loss": 2.379,
"step": 635
},
{
"epoch": 0.1598291135264183,
"grad_norm": 0.5293301939964294,
"learning_rate": 9.462901051659232e-05,
"loss": 2.3118,
"step": 636
},
{
"epoch": 0.16008041716403845,
"grad_norm": 0.38586729764938354,
"learning_rate": 9.461096992900879e-05,
"loss": 2.0312,
"step": 637
},
{
"epoch": 0.1603317208016586,
"grad_norm": 0.3819951117038727,
"learning_rate": 9.459290081959897e-05,
"loss": 1.973,
"step": 638
},
{
"epoch": 0.16058302443927877,
"grad_norm": 0.37507110834121704,
"learning_rate": 9.457480319991529e-05,
"loss": 2.3668,
"step": 639
},
{
"epoch": 0.1608343280768989,
"grad_norm": 0.3354203701019287,
"learning_rate": 9.455667708152836e-05,
"loss": 1.7392,
"step": 640
},
{
"epoch": 0.16108563171451906,
"grad_norm": 0.31567585468292236,
"learning_rate": 9.453852247602704e-05,
"loss": 2.2258,
"step": 641
},
{
"epoch": 0.16133693535213922,
"grad_norm": 0.35445067286491394,
"learning_rate": 9.452033939501839e-05,
"loss": 2.5792,
"step": 642
},
{
"epoch": 0.16158823898975938,
"grad_norm": 0.3172896206378937,
"learning_rate": 9.45021278501277e-05,
"loss": 2.4493,
"step": 643
},
{
"epoch": 0.16183954262737954,
"grad_norm": 0.4169299602508545,
"learning_rate": 9.448388785299842e-05,
"loss": 2.3125,
"step": 644
},
{
"epoch": 0.1620908462649997,
"grad_norm": 0.5329498648643494,
"learning_rate": 9.446561941529224e-05,
"loss": 2.1722,
"step": 645
},
{
"epoch": 0.16234214990261983,
"grad_norm": 0.21547356247901917,
"learning_rate": 9.444732254868898e-05,
"loss": 2.1758,
"step": 646
},
{
"epoch": 0.16259345354024,
"grad_norm": 0.29865655303001404,
"learning_rate": 9.442899726488665e-05,
"loss": 1.5479,
"step": 647
},
{
"epoch": 0.16284475717786015,
"grad_norm": 0.38035672903060913,
"learning_rate": 9.441064357560147e-05,
"loss": 2.8413,
"step": 648
},
{
"epoch": 0.1630960608154803,
"grad_norm": 0.6323506832122803,
"learning_rate": 9.439226149256779e-05,
"loss": 2.1557,
"step": 649
},
{
"epoch": 0.16334736445310047,
"grad_norm": 0.5052780508995056,
"learning_rate": 9.43738510275381e-05,
"loss": 1.7929,
"step": 650
},
{
"epoch": 0.1635986680907206,
"grad_norm": 0.3237319588661194,
"learning_rate": 9.435541219228303e-05,
"loss": 2.336,
"step": 651
},
{
"epoch": 0.16384997172834076,
"grad_norm": 0.18192382156848907,
"learning_rate": 9.433694499859141e-05,
"loss": 0.9273,
"step": 652
},
{
"epoch": 0.16410127536596092,
"grad_norm": 0.31093844771385193,
"learning_rate": 9.431844945827014e-05,
"loss": 2.633,
"step": 653
},
{
"epoch": 0.16435257900358108,
"grad_norm": 0.40673384070396423,
"learning_rate": 9.429992558314423e-05,
"loss": 1.8868,
"step": 654
},
{
"epoch": 0.16460388264120124,
"grad_norm": 0.30386146903038025,
"learning_rate": 9.428137338505687e-05,
"loss": 2.1055,
"step": 655
},
{
"epoch": 0.1648551862788214,
"grad_norm": 0.3704879581928253,
"learning_rate": 9.426279287586934e-05,
"loss": 1.9851,
"step": 656
},
{
"epoch": 0.16510648991644153,
"grad_norm": 0.17675453424453735,
"learning_rate": 9.424418406746098e-05,
"loss": 1.0456,
"step": 657
},
{
"epoch": 0.1653577935540617,
"grad_norm": 0.2338314950466156,
"learning_rate": 9.422554697172925e-05,
"loss": 1.8722,
"step": 658
},
{
"epoch": 0.16560909719168185,
"grad_norm": 0.2262151688337326,
"learning_rate": 9.420688160058972e-05,
"loss": 1.2118,
"step": 659
},
{
"epoch": 0.165860400829302,
"grad_norm": 0.32973191142082214,
"learning_rate": 9.418818796597597e-05,
"loss": 1.974,
"step": 660
},
{
"epoch": 0.16611170446692217,
"grad_norm": 0.3016055226325989,
"learning_rate": 9.416946607983975e-05,
"loss": 1.8877,
"step": 661
},
{
"epoch": 0.16636300810454233,
"grad_norm": 0.39721089601516724,
"learning_rate": 9.415071595415075e-05,
"loss": 2.1139,
"step": 662
},
{
"epoch": 0.16661431174216246,
"grad_norm": 0.3398868143558502,
"learning_rate": 9.413193760089682e-05,
"loss": 1.9081,
"step": 663
},
{
"epoch": 0.16686561537978262,
"grad_norm": 0.3406602144241333,
"learning_rate": 9.411313103208382e-05,
"loss": 2.1456,
"step": 664
},
{
"epoch": 0.16711691901740278,
"grad_norm": 0.4096097946166992,
"learning_rate": 9.409429625973563e-05,
"loss": 2.294,
"step": 665
},
{
"epoch": 0.16736822265502294,
"grad_norm": 0.2669360041618347,
"learning_rate": 9.407543329589418e-05,
"loss": 1.8453,
"step": 666
},
{
"epoch": 0.1676195262926431,
"grad_norm": 0.5418170690536499,
"learning_rate": 9.405654215261944e-05,
"loss": 2.3642,
"step": 667
},
{
"epoch": 0.16787082993026323,
"grad_norm": 0.32345104217529297,
"learning_rate": 9.403762284198936e-05,
"loss": 2.1211,
"step": 668
},
{
"epoch": 0.1681221335678834,
"grad_norm": 0.49764448404312134,
"learning_rate": 9.401867537609991e-05,
"loss": 1.9661,
"step": 669
},
{
"epoch": 0.16837343720550355,
"grad_norm": 1.5639890432357788,
"learning_rate": 9.399969976706509e-05,
"loss": 1.9938,
"step": 670
},
{
"epoch": 0.1686247408431237,
"grad_norm": 1.2142359018325806,
"learning_rate": 9.398069602701687e-05,
"loss": 2.148,
"step": 671
},
{
"epoch": 0.16887604448074386,
"grad_norm": 0.2211383879184723,
"learning_rate": 9.396166416810519e-05,
"loss": 2.0333,
"step": 672
},
{
"epoch": 0.16912734811836402,
"grad_norm": 0.3100007176399231,
"learning_rate": 9.394260420249801e-05,
"loss": 2.529,
"step": 673
},
{
"epoch": 0.16937865175598416,
"grad_norm": 0.39237165451049805,
"learning_rate": 9.39235161423812e-05,
"loss": 2.2316,
"step": 674
},
{
"epoch": 0.16962995539360431,
"grad_norm": 0.553925096988678,
"learning_rate": 9.390439999995865e-05,
"loss": 2.1865,
"step": 675
},
{
"epoch": 0.16988125903122447,
"grad_norm": 0.45522618293762207,
"learning_rate": 9.38852557874522e-05,
"loss": 2.346,
"step": 676
},
{
"epoch": 0.17013256266884463,
"grad_norm": 0.4678781032562256,
"learning_rate": 9.386608351710157e-05,
"loss": 1.9335,
"step": 677
},
{
"epoch": 0.1703838663064648,
"grad_norm": 0.3529011905193329,
"learning_rate": 9.38468832011645e-05,
"loss": 2.6414,
"step": 678
},
{
"epoch": 0.17063516994408495,
"grad_norm": 0.26705560088157654,
"learning_rate": 9.382765485191662e-05,
"loss": 2.3328,
"step": 679
},
{
"epoch": 0.17088647358170508,
"grad_norm": 0.3495092988014221,
"learning_rate": 9.380839848165149e-05,
"loss": 2.3455,
"step": 680
},
{
"epoch": 0.17113777721932524,
"grad_norm": 0.32814642786979675,
"learning_rate": 9.378911410268058e-05,
"loss": 2.4395,
"step": 681
},
{
"epoch": 0.1713890808569454,
"grad_norm": 0.33247315883636475,
"learning_rate": 9.376980172733329e-05,
"loss": 1.6115,
"step": 682
},
{
"epoch": 0.17164038449456556,
"grad_norm": 0.44238927960395813,
"learning_rate": 9.375046136795686e-05,
"loss": 2.1676,
"step": 683
},
{
"epoch": 0.17189168813218572,
"grad_norm": 0.3289899528026581,
"learning_rate": 9.373109303691652e-05,
"loss": 2.3906,
"step": 684
},
{
"epoch": 0.17214299176980588,
"grad_norm": 0.3287547826766968,
"learning_rate": 9.371169674659529e-05,
"loss": 2.1608,
"step": 685
},
{
"epoch": 0.172394295407426,
"grad_norm": 0.2546299397945404,
"learning_rate": 9.36922725093941e-05,
"loss": 2.4981,
"step": 686
},
{
"epoch": 0.17264559904504617,
"grad_norm": 0.2905830144882202,
"learning_rate": 9.367282033773177e-05,
"loss": 2.3735,
"step": 687
},
{
"epoch": 0.17289690268266633,
"grad_norm": 0.5941200256347656,
"learning_rate": 9.365334024404495e-05,
"loss": 2.7027,
"step": 688
},
{
"epoch": 0.1731482063202865,
"grad_norm": 0.3145928680896759,
"learning_rate": 9.363383224078814e-05,
"loss": 2.1827,
"step": 689
},
{
"epoch": 0.17339950995790665,
"grad_norm": 0.3063415586948395,
"learning_rate": 9.361429634043372e-05,
"loss": 2.0369,
"step": 690
},
{
"epoch": 0.17365081359552678,
"grad_norm": 0.40637439489364624,
"learning_rate": 9.359473255547186e-05,
"loss": 2.4201,
"step": 691
},
{
"epoch": 0.17390211723314694,
"grad_norm": 0.5028929710388184,
"learning_rate": 9.357514089841061e-05,
"loss": 1.7536,
"step": 692
},
{
"epoch": 0.1741534208707671,
"grad_norm": 0.19582106173038483,
"learning_rate": 9.355552138177577e-05,
"loss": 1.6803,
"step": 693
},
{
"epoch": 0.17440472450838726,
"grad_norm": 0.42780154943466187,
"learning_rate": 9.353587401811101e-05,
"loss": 2.0536,
"step": 694
},
{
"epoch": 0.17465602814600742,
"grad_norm": 0.41308510303497314,
"learning_rate": 9.351619881997779e-05,
"loss": 2.1088,
"step": 695
},
{
"epoch": 0.17490733178362758,
"grad_norm": 0.37217557430267334,
"learning_rate": 9.349649579995536e-05,
"loss": 2.3313,
"step": 696
},
{
"epoch": 0.1751586354212477,
"grad_norm": 0.49139076471328735,
"learning_rate": 9.347676497064074e-05,
"loss": 2.048,
"step": 697
},
{
"epoch": 0.17540993905886787,
"grad_norm": 0.49698978662490845,
"learning_rate": 9.345700634464876e-05,
"loss": 2.2736,
"step": 698
},
{
"epoch": 0.17566124269648803,
"grad_norm": 0.3499569296836853,
"learning_rate": 9.343721993461203e-05,
"loss": 2.4421,
"step": 699
},
{
"epoch": 0.1759125463341082,
"grad_norm": 0.2008151412010193,
"learning_rate": 9.341740575318088e-05,
"loss": 1.8728,
"step": 700
},
{
"epoch": 0.17616384997172835,
"grad_norm": 0.43613699078559875,
"learning_rate": 9.339756381302341e-05,
"loss": 2.1268,
"step": 701
},
{
"epoch": 0.1764151536093485,
"grad_norm": 0.24621760845184326,
"learning_rate": 9.337769412682551e-05,
"loss": 2.3428,
"step": 702
},
{
"epoch": 0.17666645724696864,
"grad_norm": 0.6878277659416199,
"learning_rate": 9.335779670729075e-05,
"loss": 2.1971,
"step": 703
},
{
"epoch": 0.1769177608845888,
"grad_norm": 0.3094477653503418,
"learning_rate": 9.333787156714047e-05,
"loss": 1.6276,
"step": 704
},
{
"epoch": 0.17716906452220896,
"grad_norm": 0.2712198495864868,
"learning_rate": 9.331791871911371e-05,
"loss": 2.1207,
"step": 705
},
{
"epoch": 0.17742036815982912,
"grad_norm": 0.40410006046295166,
"learning_rate": 9.329793817596724e-05,
"loss": 2.5001,
"step": 706
},
{
"epoch": 0.17767167179744928,
"grad_norm": 0.37285852432250977,
"learning_rate": 9.327792995047553e-05,
"loss": 2.513,
"step": 707
},
{
"epoch": 0.17792297543506944,
"grad_norm": 0.3589307963848114,
"learning_rate": 9.325789405543075e-05,
"loss": 2.3407,
"step": 708
},
{
"epoch": 0.17817427907268957,
"grad_norm": 0.16013433039188385,
"learning_rate": 9.323783050364276e-05,
"loss": 1.0699,
"step": 709
},
{
"epoch": 0.17842558271030973,
"grad_norm": 0.3747367858886719,
"learning_rate": 9.321773930793914e-05,
"loss": 2.2041,
"step": 710
},
{
"epoch": 0.1786768863479299,
"grad_norm": 0.48945263028144836,
"learning_rate": 9.319762048116503e-05,
"loss": 1.8284,
"step": 711
},
{
"epoch": 0.17892818998555005,
"grad_norm": 0.5572097897529602,
"learning_rate": 9.317747403618337e-05,
"loss": 1.8432,
"step": 712
},
{
"epoch": 0.1791794936231702,
"grad_norm": 0.4351899027824402,
"learning_rate": 9.31572999858747e-05,
"loss": 2.0469,
"step": 713
},
{
"epoch": 0.17943079726079034,
"grad_norm": 0.747898519039154,
"learning_rate": 9.31370983431372e-05,
"loss": 2.3444,
"step": 714
},
{
"epoch": 0.1796821008984105,
"grad_norm": 0.3677506148815155,
"learning_rate": 9.311686912088669e-05,
"loss": 1.7389,
"step": 715
},
{
"epoch": 0.17993340453603066,
"grad_norm": 0.7408022880554199,
"learning_rate": 9.309661233205663e-05,
"loss": 2.3839,
"step": 716
},
{
"epoch": 0.18018470817365081,
"grad_norm": 0.46684297919273376,
"learning_rate": 9.307632798959813e-05,
"loss": 2.4899,
"step": 717
},
{
"epoch": 0.18043601181127097,
"grad_norm": 0.5166415572166443,
"learning_rate": 9.305601610647989e-05,
"loss": 2.2496,
"step": 718
},
{
"epoch": 0.18068731544889113,
"grad_norm": 0.17087407410144806,
"learning_rate": 9.30356766956882e-05,
"loss": 0.624,
"step": 719
},
{
"epoch": 0.18093861908651127,
"grad_norm": 0.3293837904930115,
"learning_rate": 9.301530977022701e-05,
"loss": 2.1589,
"step": 720
},
{
"epoch": 0.18118992272413142,
"grad_norm": 0.5174190402030945,
"learning_rate": 9.29949153431178e-05,
"loss": 1.9908,
"step": 721
},
{
"epoch": 0.18144122636175158,
"grad_norm": 0.6492531895637512,
"learning_rate": 9.297449342739964e-05,
"loss": 2.3542,
"step": 722
},
{
"epoch": 0.18169252999937174,
"grad_norm": 0.3231172263622284,
"learning_rate": 9.295404403612924e-05,
"loss": 2.0346,
"step": 723
},
{
"epoch": 0.1819438336369919,
"grad_norm": 0.3993067145347595,
"learning_rate": 9.293356718238077e-05,
"loss": 2.0821,
"step": 724
},
{
"epoch": 0.18219513727461206,
"grad_norm": 0.3920503854751587,
"learning_rate": 9.291306287924608e-05,
"loss": 2.1273,
"step": 725
},
{
"epoch": 0.1824464409122322,
"grad_norm": 0.3167310655117035,
"learning_rate": 9.289253113983444e-05,
"loss": 2.0075,
"step": 726
},
{
"epoch": 0.18269774454985235,
"grad_norm": 0.3710818290710449,
"learning_rate": 9.287197197727277e-05,
"loss": 2.0204,
"step": 727
},
{
"epoch": 0.1829490481874725,
"grad_norm": 0.18097934126853943,
"learning_rate": 9.285138540470546e-05,
"loss": 0.749,
"step": 728
},
{
"epoch": 0.18320035182509267,
"grad_norm": 0.18432289361953735,
"learning_rate": 9.283077143529446e-05,
"loss": 0.8822,
"step": 729
},
{
"epoch": 0.18345165546271283,
"grad_norm": 0.3951958417892456,
"learning_rate": 9.281013008221921e-05,
"loss": 2.1613,
"step": 730
},
{
"epoch": 0.183702959100333,
"grad_norm": 0.3688110113143921,
"learning_rate": 9.278946135867665e-05,
"loss": 2.8058,
"step": 731
},
{
"epoch": 0.18395426273795312,
"grad_norm": 0.29696959257125854,
"learning_rate": 9.276876527788127e-05,
"loss": 1.7129,
"step": 732
},
{
"epoch": 0.18420556637557328,
"grad_norm": 0.405823677778244,
"learning_rate": 9.274804185306503e-05,
"loss": 2.334,
"step": 733
},
{
"epoch": 0.18445687001319344,
"grad_norm": 0.4110073447227478,
"learning_rate": 9.27272910974773e-05,
"loss": 2.4929,
"step": 734
},
{
"epoch": 0.1847081736508136,
"grad_norm": 0.3715936541557312,
"learning_rate": 9.270651302438502e-05,
"loss": 1.7891,
"step": 735
},
{
"epoch": 0.18495947728843376,
"grad_norm": 0.1682804673910141,
"learning_rate": 9.268570764707257e-05,
"loss": 0.726,
"step": 736
},
{
"epoch": 0.1852107809260539,
"grad_norm": 0.2395019680261612,
"learning_rate": 9.266487497884176e-05,
"loss": 2.4391,
"step": 737
},
{
"epoch": 0.18546208456367405,
"grad_norm": 0.3601885437965393,
"learning_rate": 9.264401503301185e-05,
"loss": 2.5297,
"step": 738
},
{
"epoch": 0.1857133882012942,
"grad_norm": 0.26369959115982056,
"learning_rate": 9.262312782291959e-05,
"loss": 1.7459,
"step": 739
},
{
"epoch": 0.18596469183891437,
"grad_norm": 0.15436404943466187,
"learning_rate": 9.26022133619191e-05,
"loss": 1.0022,
"step": 740
},
{
"epoch": 0.18621599547653453,
"grad_norm": 0.3303896486759186,
"learning_rate": 9.258127166338196e-05,
"loss": 2.3745,
"step": 741
},
{
"epoch": 0.1864672991141547,
"grad_norm": 0.3118177056312561,
"learning_rate": 9.256030274069713e-05,
"loss": 2.8046,
"step": 742
},
{
"epoch": 0.18671860275177482,
"grad_norm": 0.17974944412708282,
"learning_rate": 9.253930660727104e-05,
"loss": 0.8354,
"step": 743
},
{
"epoch": 0.18696990638939498,
"grad_norm": 0.2209557741880417,
"learning_rate": 9.251828327652742e-05,
"loss": 2.1792,
"step": 744
},
{
"epoch": 0.18722121002701514,
"grad_norm": 0.4132773280143738,
"learning_rate": 9.24972327619075e-05,
"loss": 2.3944,
"step": 745
},
{
"epoch": 0.1874725136646353,
"grad_norm": 0.47139856219291687,
"learning_rate": 9.24761550768698e-05,
"loss": 2.3182,
"step": 746
},
{
"epoch": 0.18772381730225546,
"grad_norm": 0.40354403853416443,
"learning_rate": 9.245505023489024e-05,
"loss": 2.1719,
"step": 747
},
{
"epoch": 0.18797512093987562,
"grad_norm": 0.23668596148490906,
"learning_rate": 9.243391824946213e-05,
"loss": 1.9976,
"step": 748
},
{
"epoch": 0.18822642457749575,
"grad_norm": 0.48701080679893494,
"learning_rate": 9.24127591340961e-05,
"loss": 2.3892,
"step": 749
},
{
"epoch": 0.1884777282151159,
"grad_norm": 0.48195892572402954,
"learning_rate": 9.239157290232014e-05,
"loss": 2.2488,
"step": 750
},
{
"epoch": 0.18872903185273607,
"grad_norm": 0.3957456946372986,
"learning_rate": 9.237035956767956e-05,
"loss": 2.2675,
"step": 751
},
{
"epoch": 0.18898033549035623,
"grad_norm": 0.419040709733963,
"learning_rate": 9.234911914373702e-05,
"loss": 1.9331,
"step": 752
},
{
"epoch": 0.1892316391279764,
"grad_norm": 0.3198854327201843,
"learning_rate": 9.23278516440725e-05,
"loss": 1.9211,
"step": 753
},
{
"epoch": 0.18948294276559652,
"grad_norm": 0.4320249855518341,
"learning_rate": 9.230655708228328e-05,
"loss": 1.9932,
"step": 754
},
{
"epoch": 0.18973424640321668,
"grad_norm": 0.34588703513145447,
"learning_rate": 9.228523547198393e-05,
"loss": 1.6818,
"step": 755
},
{
"epoch": 0.18998555004083684,
"grad_norm": 0.17924979329109192,
"learning_rate": 9.226388682680633e-05,
"loss": 1.2715,
"step": 756
},
{
"epoch": 0.190236853678457,
"grad_norm": 0.3479664921760559,
"learning_rate": 9.224251116039965e-05,
"loss": 2.6595,
"step": 757
},
{
"epoch": 0.19048815731607716,
"grad_norm": 0.9396395087242126,
"learning_rate": 9.222110848643035e-05,
"loss": 2.1373,
"step": 758
},
{
"epoch": 0.19073946095369732,
"grad_norm": 0.423880934715271,
"learning_rate": 9.219967881858209e-05,
"loss": 2.0013,
"step": 759
},
{
"epoch": 0.19099076459131745,
"grad_norm": 0.18442866206169128,
"learning_rate": 9.217822217055586e-05,
"loss": 1.1016,
"step": 760
},
{
"epoch": 0.1912420682289376,
"grad_norm": 0.33031755685806274,
"learning_rate": 9.215673855606986e-05,
"loss": 2.208,
"step": 761
},
{
"epoch": 0.19149337186655777,
"grad_norm": 0.5207613706588745,
"learning_rate": 9.213522798885956e-05,
"loss": 2.0212,
"step": 762
},
{
"epoch": 0.19174467550417792,
"grad_norm": 0.29409703612327576,
"learning_rate": 9.211369048267764e-05,
"loss": 2.5577,
"step": 763
},
{
"epoch": 0.19199597914179808,
"grad_norm": 0.44755882024765015,
"learning_rate": 9.2092126051294e-05,
"loss": 2.1216,
"step": 764
},
{
"epoch": 0.19224728277941824,
"grad_norm": 0.33680227398872375,
"learning_rate": 9.207053470849576e-05,
"loss": 2.5058,
"step": 765
},
{
"epoch": 0.19249858641703838,
"grad_norm": 0.41669735312461853,
"learning_rate": 9.204891646808726e-05,
"loss": 2.5137,
"step": 766
},
{
"epoch": 0.19274989005465853,
"grad_norm": 0.4869091808795929,
"learning_rate": 9.202727134389004e-05,
"loss": 2.2094,
"step": 767
},
{
"epoch": 0.1930011936922787,
"grad_norm": 0.3771580159664154,
"learning_rate": 9.20055993497428e-05,
"loss": 2.5748,
"step": 768
},
{
"epoch": 0.19325249732989885,
"grad_norm": 0.4663945734500885,
"learning_rate": 9.198390049950143e-05,
"loss": 2.6845,
"step": 769
},
{
"epoch": 0.193503800967519,
"grad_norm": 0.6000380516052246,
"learning_rate": 9.196217480703899e-05,
"loss": 2.4598,
"step": 770
},
{
"epoch": 0.19375510460513917,
"grad_norm": 0.4322783946990967,
"learning_rate": 9.194042228624572e-05,
"loss": 2.5049,
"step": 771
},
{
"epoch": 0.1940064082427593,
"grad_norm": 0.194077730178833,
"learning_rate": 9.191864295102899e-05,
"loss": 1.5018,
"step": 772
},
{
"epoch": 0.19425771188037946,
"grad_norm": 0.28692805767059326,
"learning_rate": 9.189683681531333e-05,
"loss": 1.8483,
"step": 773
},
{
"epoch": 0.19450901551799962,
"grad_norm": 0.8345639109611511,
"learning_rate": 9.187500389304037e-05,
"loss": 1.8403,
"step": 774
},
{
"epoch": 0.19476031915561978,
"grad_norm": 0.3533509373664856,
"learning_rate": 9.185314419816892e-05,
"loss": 2.4375,
"step": 775
},
{
"epoch": 0.19501162279323994,
"grad_norm": 0.40252941846847534,
"learning_rate": 9.18312577446749e-05,
"loss": 2.1452,
"step": 776
},
{
"epoch": 0.19526292643086007,
"grad_norm": 0.4904803931713104,
"learning_rate": 9.180934454655126e-05,
"loss": 2.2475,
"step": 777
},
{
"epoch": 0.19551423006848023,
"grad_norm": 0.4086427688598633,
"learning_rate": 9.178740461780812e-05,
"loss": 1.9234,
"step": 778
},
{
"epoch": 0.1957655337061004,
"grad_norm": 0.32106295228004456,
"learning_rate": 9.176543797247271e-05,
"loss": 2.3433,
"step": 779
},
{
"epoch": 0.19601683734372055,
"grad_norm": 0.45663875341415405,
"learning_rate": 9.17434446245893e-05,
"loss": 2.4102,
"step": 780
},
{
"epoch": 0.1962681409813407,
"grad_norm": 0.16669417917728424,
"learning_rate": 9.17214245882192e-05,
"loss": 0.9148,
"step": 781
},
{
"epoch": 0.19651944461896087,
"grad_norm": 0.21185293793678284,
"learning_rate": 9.169937787744088e-05,
"loss": 2.032,
"step": 782
},
{
"epoch": 0.196770748256581,
"grad_norm": 0.28057172894477844,
"learning_rate": 9.167730450634975e-05,
"loss": 2.3357,
"step": 783
},
{
"epoch": 0.19702205189420116,
"grad_norm": 0.30073508620262146,
"learning_rate": 9.165520448905835e-05,
"loss": 1.9842,
"step": 784
},
{
"epoch": 0.19727335553182132,
"grad_norm": 0.5807662606239319,
"learning_rate": 9.163307783969624e-05,
"loss": 2.1852,
"step": 785
},
{
"epoch": 0.19752465916944148,
"grad_norm": 0.43151628971099854,
"learning_rate": 9.161092457240999e-05,
"loss": 2.3249,
"step": 786
},
{
"epoch": 0.19777596280706164,
"grad_norm": 0.31615135073661804,
"learning_rate": 9.158874470136319e-05,
"loss": 2.3183,
"step": 787
},
{
"epoch": 0.1980272664446818,
"grad_norm": 0.4318180978298187,
"learning_rate": 9.156653824073642e-05,
"loss": 2.0189,
"step": 788
},
{
"epoch": 0.19827857008230193,
"grad_norm": 0.6335446834564209,
"learning_rate": 9.154430520472731e-05,
"loss": 1.8264,
"step": 789
},
{
"epoch": 0.1985298737199221,
"grad_norm": 0.1641846001148224,
"learning_rate": 9.152204560755045e-05,
"loss": 1.3867,
"step": 790
},
{
"epoch": 0.19878117735754225,
"grad_norm": 0.44742926955223083,
"learning_rate": 9.149975946343741e-05,
"loss": 1.9269,
"step": 791
},
{
"epoch": 0.1990324809951624,
"grad_norm": 0.438804566860199,
"learning_rate": 9.147744678663672e-05,
"loss": 1.8561,
"step": 792
},
{
"epoch": 0.19928378463278257,
"grad_norm": 0.6063904166221619,
"learning_rate": 9.145510759141393e-05,
"loss": 2.1038,
"step": 793
},
{
"epoch": 0.19953508827040273,
"grad_norm": 0.3686808943748474,
"learning_rate": 9.143274189205147e-05,
"loss": 2.811,
"step": 794
},
{
"epoch": 0.19978639190802286,
"grad_norm": 0.45831429958343506,
"learning_rate": 9.141034970284877e-05,
"loss": 2.1029,
"step": 795
},
{
"epoch": 0.20003769554564302,
"grad_norm": 0.4418196678161621,
"learning_rate": 9.138793103812218e-05,
"loss": 1.9126,
"step": 796
},
{
"epoch": 0.20028899918326318,
"grad_norm": 0.23358654975891113,
"learning_rate": 9.136548591220495e-05,
"loss": 2.0087,
"step": 797
},
{
"epoch": 0.20054030282088334,
"grad_norm": 0.5014088749885559,
"learning_rate": 9.134301433944731e-05,
"loss": 2.1698,
"step": 798
},
{
"epoch": 0.2007916064585035,
"grad_norm": 0.48934677243232727,
"learning_rate": 9.132051633421632e-05,
"loss": 1.9628,
"step": 799
},
{
"epoch": 0.20104291009612363,
"grad_norm": 0.46975913643836975,
"learning_rate": 9.129799191089601e-05,
"loss": 2.1432,
"step": 800
},
{
"epoch": 0.2012942137337438,
"grad_norm": 0.33380022644996643,
"learning_rate": 9.127544108388725e-05,
"loss": 1.7332,
"step": 801
},
{
"epoch": 0.20154551737136395,
"grad_norm": 0.6146292090415955,
"learning_rate": 9.125286386760785e-05,
"loss": 2.2721,
"step": 802
},
{
"epoch": 0.2017968210089841,
"grad_norm": 0.14950276911258698,
"learning_rate": 9.12302602764924e-05,
"loss": 1.2602,
"step": 803
},
{
"epoch": 0.20204812464660427,
"grad_norm": 0.4614298641681671,
"learning_rate": 9.120763032499242e-05,
"loss": 2.2327,
"step": 804
},
{
"epoch": 0.20229942828422443,
"grad_norm": 0.249393031001091,
"learning_rate": 9.118497402757631e-05,
"loss": 1.8148,
"step": 805
},
{
"epoch": 0.20255073192184456,
"grad_norm": 0.41946738958358765,
"learning_rate": 9.116229139872922e-05,
"loss": 2.5221,
"step": 806
},
{
"epoch": 0.20280203555946472,
"grad_norm": 0.4390411078929901,
"learning_rate": 9.113958245295321e-05,
"loss": 2.1989,
"step": 807
},
{
"epoch": 0.20305333919708488,
"grad_norm": 0.5600268244743347,
"learning_rate": 9.111684720476717e-05,
"loss": 2.6773,
"step": 808
},
{
"epoch": 0.20330464283470503,
"grad_norm": 0.2843821346759796,
"learning_rate": 9.109408566870673e-05,
"loss": 1.9472,
"step": 809
},
{
"epoch": 0.2035559464723252,
"grad_norm": 0.3715212643146515,
"learning_rate": 9.107129785932443e-05,
"loss": 2.1466,
"step": 810
},
{
"epoch": 0.20380725010994535,
"grad_norm": 0.8551393151283264,
"learning_rate": 9.10484837911895e-05,
"loss": 2.199,
"step": 811
},
{
"epoch": 0.20405855374756549,
"grad_norm": 0.5052684545516968,
"learning_rate": 9.102564347888806e-05,
"loss": 1.9972,
"step": 812
},
{
"epoch": 0.20430985738518564,
"grad_norm": 0.24479907751083374,
"learning_rate": 9.100277693702294e-05,
"loss": 2.3708,
"step": 813
},
{
"epoch": 0.2045611610228058,
"grad_norm": 0.42139294743537903,
"learning_rate": 9.097988418021377e-05,
"loss": 1.9225,
"step": 814
},
{
"epoch": 0.20481246466042596,
"grad_norm": 0.342489629983902,
"learning_rate": 9.095696522309693e-05,
"loss": 2.7236,
"step": 815
},
{
"epoch": 0.20506376829804612,
"grad_norm": 0.47755831480026245,
"learning_rate": 9.093402008032554e-05,
"loss": 2.2168,
"step": 816
},
{
"epoch": 0.20531507193566628,
"grad_norm": 0.32807457447052,
"learning_rate": 9.09110487665695e-05,
"loss": 2.4625,
"step": 817
},
{
"epoch": 0.20556637557328641,
"grad_norm": 0.3581337034702301,
"learning_rate": 9.088805129651542e-05,
"loss": 2.6607,
"step": 818
},
{
"epoch": 0.20581767921090657,
"grad_norm": 0.24006719887256622,
"learning_rate": 9.08650276848666e-05,
"loss": 2.0933,
"step": 819
},
{
"epoch": 0.20606898284852673,
"grad_norm": 0.22163568437099457,
"learning_rate": 9.084197794634312e-05,
"loss": 2.0709,
"step": 820
},
{
"epoch": 0.2063202864861469,
"grad_norm": 0.20495516061782837,
"learning_rate": 9.081890209568169e-05,
"loss": 1.8137,
"step": 821
},
{
"epoch": 0.20657159012376705,
"grad_norm": 0.33006590604782104,
"learning_rate": 9.079580014763579e-05,
"loss": 2.242,
"step": 822
},
{
"epoch": 0.20682289376138718,
"grad_norm": 0.31708237528800964,
"learning_rate": 9.077267211697554e-05,
"loss": 2.5707,
"step": 823
},
{
"epoch": 0.20707419739900734,
"grad_norm": 0.3039303719997406,
"learning_rate": 9.07495180184877e-05,
"loss": 2.4013,
"step": 824
},
{
"epoch": 0.2073255010366275,
"grad_norm": 0.3243713974952698,
"learning_rate": 9.072633786697581e-05,
"loss": 1.9324,
"step": 825
},
{
"epoch": 0.20757680467424766,
"grad_norm": 0.376941055059433,
"learning_rate": 9.070313167725995e-05,
"loss": 1.9856,
"step": 826
},
{
"epoch": 0.20782810831186782,
"grad_norm": 0.4256725013256073,
"learning_rate": 9.06798994641769e-05,
"loss": 1.9451,
"step": 827
},
{
"epoch": 0.20807941194948798,
"grad_norm": 0.21601825952529907,
"learning_rate": 9.06566412425801e-05,
"loss": 0.9348,
"step": 828
},
{
"epoch": 0.2083307155871081,
"grad_norm": 0.5165765881538391,
"learning_rate": 9.063335702733958e-05,
"loss": 2.3604,
"step": 829
},
{
"epoch": 0.20858201922472827,
"grad_norm": 0.4365144670009613,
"learning_rate": 9.061004683334196e-05,
"loss": 2.0167,
"step": 830
},
{
"epoch": 0.20883332286234843,
"grad_norm": 0.2237500250339508,
"learning_rate": 9.058671067549056e-05,
"loss": 1.7844,
"step": 831
},
{
"epoch": 0.2090846264999686,
"grad_norm": 0.26887792348861694,
"learning_rate": 9.056334856870522e-05,
"loss": 2.3547,
"step": 832
},
{
"epoch": 0.20933593013758875,
"grad_norm": 0.21619755029678345,
"learning_rate": 9.053996052792244e-05,
"loss": 2.0557,
"step": 833
},
{
"epoch": 0.2095872337752089,
"grad_norm": 0.5002549886703491,
"learning_rate": 9.051654656809521e-05,
"loss": 1.962,
"step": 834
},
{
"epoch": 0.20983853741282904,
"grad_norm": 0.3360225260257721,
"learning_rate": 9.049310670419316e-05,
"loss": 1.9531,
"step": 835
},
{
"epoch": 0.2100898410504492,
"grad_norm": 0.24657025933265686,
"learning_rate": 9.046964095120248e-05,
"loss": 1.2244,
"step": 836
},
{
"epoch": 0.21034114468806936,
"grad_norm": 0.29951533675193787,
"learning_rate": 9.044614932412587e-05,
"loss": 1.9471,
"step": 837
},
{
"epoch": 0.21059244832568952,
"grad_norm": 0.3678789734840393,
"learning_rate": 9.04226318379826e-05,
"loss": 2.3963,
"step": 838
},
{
"epoch": 0.21084375196330968,
"grad_norm": 0.45650580525398254,
"learning_rate": 9.03990885078085e-05,
"loss": 2.342,
"step": 839
},
{
"epoch": 0.21109505560092984,
"grad_norm": 0.4444562792778015,
"learning_rate": 9.037551934865587e-05,
"loss": 1.9851,
"step": 840
},
{
"epoch": 0.21134635923854997,
"grad_norm": 0.2063484787940979,
"learning_rate": 9.035192437559354e-05,
"loss": 2.2532,
"step": 841
},
{
"epoch": 0.21159766287617013,
"grad_norm": 0.3520076274871826,
"learning_rate": 9.032830360370688e-05,
"loss": 1.6042,
"step": 842
},
{
"epoch": 0.2118489665137903,
"grad_norm": 0.4106435477733612,
"learning_rate": 9.03046570480977e-05,
"loss": 2.0197,
"step": 843
},
{
"epoch": 0.21210027015141045,
"grad_norm": 0.37212634086608887,
"learning_rate": 9.028098472388433e-05,
"loss": 2.1224,
"step": 844
},
{
"epoch": 0.2123515737890306,
"grad_norm": 0.36991527676582336,
"learning_rate": 9.025728664620157e-05,
"loss": 2.5759,
"step": 845
},
{
"epoch": 0.21260287742665074,
"grad_norm": 0.3646388649940491,
"learning_rate": 9.023356283020067e-05,
"loss": 2.2962,
"step": 846
},
{
"epoch": 0.2128541810642709,
"grad_norm": 0.21754935383796692,
"learning_rate": 9.020981329104936e-05,
"loss": 1.7198,
"step": 847
},
{
"epoch": 0.21310548470189106,
"grad_norm": 0.24025185406208038,
"learning_rate": 9.01860380439318e-05,
"loss": 1.271,
"step": 848
},
{
"epoch": 0.21335678833951122,
"grad_norm": 0.42848479747772217,
"learning_rate": 9.016223710404856e-05,
"loss": 2.3559,
"step": 849
},
{
"epoch": 0.21360809197713138,
"grad_norm": 0.4237200915813446,
"learning_rate": 9.013841048661673e-05,
"loss": 2.2002,
"step": 850
},
{
"epoch": 0.21385939561475154,
"grad_norm": 0.31660404801368713,
"learning_rate": 9.01145582068697e-05,
"loss": 2.441,
"step": 851
},
{
"epoch": 0.21411069925237167,
"grad_norm": 0.4812658131122589,
"learning_rate": 9.009068028005732e-05,
"loss": 2.2861,
"step": 852
},
{
"epoch": 0.21436200288999183,
"grad_norm": 0.37174031138420105,
"learning_rate": 9.006677672144586e-05,
"loss": 2.4933,
"step": 853
},
{
"epoch": 0.21461330652761199,
"grad_norm": 0.5248540639877319,
"learning_rate": 9.004284754631793e-05,
"loss": 1.9397,
"step": 854
},
{
"epoch": 0.21486461016523214,
"grad_norm": 0.2802974283695221,
"learning_rate": 9.001889276997258e-05,
"loss": 2.2688,
"step": 855
},
{
"epoch": 0.2151159138028523,
"grad_norm": 0.4122345745563507,
"learning_rate": 8.999491240772516e-05,
"loss": 2.1688,
"step": 856
},
{
"epoch": 0.21536721744047246,
"grad_norm": 0.49358898401260376,
"learning_rate": 8.99709064749074e-05,
"loss": 2.0717,
"step": 857
},
{
"epoch": 0.2156185210780926,
"grad_norm": 0.415002703666687,
"learning_rate": 8.994687498686742e-05,
"loss": 2.4572,
"step": 858
},
{
"epoch": 0.21586982471571275,
"grad_norm": 0.3565453290939331,
"learning_rate": 8.992281795896962e-05,
"loss": 2.2275,
"step": 859
},
{
"epoch": 0.21612112835333291,
"grad_norm": 0.25147542357444763,
"learning_rate": 8.989873540659476e-05,
"loss": 1.6368,
"step": 860
},
{
"epoch": 0.21637243199095307,
"grad_norm": 0.330954372882843,
"learning_rate": 8.987462734513993e-05,
"loss": 1.6743,
"step": 861
},
{
"epoch": 0.21662373562857323,
"grad_norm": 0.3177106976509094,
"learning_rate": 8.985049379001849e-05,
"loss": 2.373,
"step": 862
},
{
"epoch": 0.21687503926619336,
"grad_norm": 0.4051658511161804,
"learning_rate": 8.982633475666014e-05,
"loss": 2.504,
"step": 863
},
{
"epoch": 0.21712634290381352,
"grad_norm": 0.1954205185174942,
"learning_rate": 8.980215026051083e-05,
"loss": 1.8698,
"step": 864
},
{
"epoch": 0.21737764654143368,
"grad_norm": 0.31402403116226196,
"learning_rate": 8.977794031703282e-05,
"loss": 2.2363,
"step": 865
},
{
"epoch": 0.21762895017905384,
"grad_norm": 0.28115931153297424,
"learning_rate": 8.975370494170463e-05,
"loss": 2.3547,
"step": 866
},
{
"epoch": 0.217880253816674,
"grad_norm": 0.4603864252567291,
"learning_rate": 8.972944415002105e-05,
"loss": 2.3678,
"step": 867
},
{
"epoch": 0.21813155745429416,
"grad_norm": 0.21994365751743317,
"learning_rate": 8.97051579574931e-05,
"loss": 2.2028,
"step": 868
},
{
"epoch": 0.2183828610919143,
"grad_norm": 0.3067527711391449,
"learning_rate": 8.968084637964804e-05,
"loss": 2.6017,
"step": 869
},
{
"epoch": 0.21863416472953445,
"grad_norm": 0.33018767833709717,
"learning_rate": 8.96565094320294e-05,
"loss": 2.3301,
"step": 870
},
{
"epoch": 0.2188854683671546,
"grad_norm": 0.6793331503868103,
"learning_rate": 8.963214713019687e-05,
"loss": 2.7803,
"step": 871
},
{
"epoch": 0.21913677200477477,
"grad_norm": 0.3478843569755554,
"learning_rate": 8.96077594897264e-05,
"loss": 2.2915,
"step": 872
},
{
"epoch": 0.21938807564239493,
"grad_norm": 0.400673508644104,
"learning_rate": 8.95833465262101e-05,
"loss": 1.2611,
"step": 873
},
{
"epoch": 0.2196393792800151,
"grad_norm": 0.27117395401000977,
"learning_rate": 8.955890825525631e-05,
"loss": 2.6418,
"step": 874
},
{
"epoch": 0.21989068291763522,
"grad_norm": 0.41636621952056885,
"learning_rate": 8.953444469248952e-05,
"loss": 2.0555,
"step": 875
},
{
"epoch": 0.22014198655525538,
"grad_norm": 0.5339227318763733,
"learning_rate": 8.95099558535504e-05,
"loss": 2.4363,
"step": 876
},
{
"epoch": 0.22039329019287554,
"grad_norm": 0.4329914450645447,
"learning_rate": 8.948544175409579e-05,
"loss": 2.198,
"step": 877
},
{
"epoch": 0.2206445938304957,
"grad_norm": 0.377668559551239,
"learning_rate": 8.946090240979865e-05,
"loss": 2.2962,
"step": 878
},
{
"epoch": 0.22089589746811586,
"grad_norm": 0.3951661288738251,
"learning_rate": 8.943633783634813e-05,
"loss": 2.1264,
"step": 879
},
{
"epoch": 0.22114720110573602,
"grad_norm": 0.36469566822052,
"learning_rate": 8.941174804944948e-05,
"loss": 2.6947,
"step": 880
},
{
"epoch": 0.22139850474335615,
"grad_norm": 0.21884319186210632,
"learning_rate": 8.938713306482403e-05,
"loss": 1.9526,
"step": 881
},
{
"epoch": 0.2216498083809763,
"grad_norm": 0.38323378562927246,
"learning_rate": 8.936249289820931e-05,
"loss": 2.1726,
"step": 882
},
{
"epoch": 0.22190111201859647,
"grad_norm": 0.653200089931488,
"learning_rate": 8.933782756535887e-05,
"loss": 2.505,
"step": 883
},
{
"epoch": 0.22215241565621663,
"grad_norm": 0.41666847467422485,
"learning_rate": 8.931313708204239e-05,
"loss": 2.218,
"step": 884
},
{
"epoch": 0.2224037192938368,
"grad_norm": 0.3992173373699188,
"learning_rate": 8.928842146404562e-05,
"loss": 2.4002,
"step": 885
},
{
"epoch": 0.22265502293145692,
"grad_norm": 0.5349919199943542,
"learning_rate": 8.92636807271704e-05,
"loss": 1.5443,
"step": 886
},
{
"epoch": 0.22290632656907708,
"grad_norm": 0.3590314984321594,
"learning_rate": 8.923891488723459e-05,
"loss": 2.3424,
"step": 887
},
{
"epoch": 0.22315763020669724,
"grad_norm": 0.4399639666080475,
"learning_rate": 8.921412396007212e-05,
"loss": 2.3039,
"step": 888
},
{
"epoch": 0.2234089338443174,
"grad_norm": 0.476307213306427,
"learning_rate": 8.918930796153297e-05,
"loss": 2.0807,
"step": 889
},
{
"epoch": 0.22366023748193756,
"grad_norm": 0.40012243390083313,
"learning_rate": 8.916446690748315e-05,
"loss": 2.2535,
"step": 890
},
{
"epoch": 0.22391154111955772,
"grad_norm": 0.4795278012752533,
"learning_rate": 8.913960081380465e-05,
"loss": 1.9693,
"step": 891
},
{
"epoch": 0.22416284475717785,
"grad_norm": 0.1798836588859558,
"learning_rate": 8.911470969639551e-05,
"loss": 0.6201,
"step": 892
},
{
"epoch": 0.224414148394798,
"grad_norm": 0.8967116475105286,
"learning_rate": 8.908979357116976e-05,
"loss": 2.3321,
"step": 893
},
{
"epoch": 0.22466545203241817,
"grad_norm": 0.363643616437912,
"learning_rate": 8.90648524540574e-05,
"loss": 2.2787,
"step": 894
},
{
"epoch": 0.22491675567003833,
"grad_norm": 0.46721211075782776,
"learning_rate": 8.903988636100445e-05,
"loss": 2.05,
"step": 895
},
{
"epoch": 0.22516805930765849,
"grad_norm": 0.41919320821762085,
"learning_rate": 8.901489530797282e-05,
"loss": 2.2821,
"step": 896
},
{
"epoch": 0.22541936294527865,
"grad_norm": 0.29892247915267944,
"learning_rate": 8.898987931094049e-05,
"loss": 2.3336,
"step": 897
},
{
"epoch": 0.22567066658289878,
"grad_norm": 0.41926395893096924,
"learning_rate": 8.896483838590131e-05,
"loss": 2.1726,
"step": 898
},
{
"epoch": 0.22592197022051894,
"grad_norm": 0.3506767451763153,
"learning_rate": 8.893977254886505e-05,
"loss": 1.8011,
"step": 899
},
{
"epoch": 0.2261732738581391,
"grad_norm": 0.39030522108078003,
"learning_rate": 8.891468181585747e-05,
"loss": 1.962,
"step": 900
},
{
"epoch": 0.22642457749575925,
"grad_norm": 0.40221068263053894,
"learning_rate": 8.888956620292022e-05,
"loss": 2.2385,
"step": 901
},
{
"epoch": 0.22667588113337941,
"grad_norm": 0.312210351228714,
"learning_rate": 8.886442572611087e-05,
"loss": 2.3336,
"step": 902
},
{
"epoch": 0.22692718477099957,
"grad_norm": 0.1972939372062683,
"learning_rate": 8.883926040150283e-05,
"loss": 1.8672,
"step": 903
},
{
"epoch": 0.2271784884086197,
"grad_norm": 0.30837222933769226,
"learning_rate": 8.881407024518548e-05,
"loss": 2.1923,
"step": 904
},
{
"epoch": 0.22742979204623986,
"grad_norm": 0.32954657077789307,
"learning_rate": 8.8788855273264e-05,
"loss": 2.53,
"step": 905
},
{
"epoch": 0.22768109568386002,
"grad_norm": 0.4983203113079071,
"learning_rate": 8.87636155018595e-05,
"loss": 1.7686,
"step": 906
},
{
"epoch": 0.22793239932148018,
"grad_norm": 0.29157817363739014,
"learning_rate": 8.873835094710891e-05,
"loss": 2.0444,
"step": 907
},
{
"epoch": 0.22818370295910034,
"grad_norm": 0.7063765525817871,
"learning_rate": 8.8713061625165e-05,
"loss": 2.3444,
"step": 908
},
{
"epoch": 0.22843500659672047,
"grad_norm": 0.2752906382083893,
"learning_rate": 8.868774755219641e-05,
"loss": 2.2363,
"step": 909
},
{
"epoch": 0.22868631023434063,
"grad_norm": 0.4020974338054657,
"learning_rate": 8.866240874438755e-05,
"loss": 2.2792,
"step": 910
},
{
"epoch": 0.2289376138719608,
"grad_norm": 0.3203171193599701,
"learning_rate": 8.863704521793869e-05,
"loss": 2.1895,
"step": 911
},
{
"epoch": 0.22918891750958095,
"grad_norm": 0.7605288028717041,
"learning_rate": 8.861165698906589e-05,
"loss": 2.2526,
"step": 912
},
{
"epoch": 0.2294402211472011,
"grad_norm": 0.4583891034126282,
"learning_rate": 8.8586244074001e-05,
"loss": 2.5682,
"step": 913
},
{
"epoch": 0.22969152478482127,
"grad_norm": 0.29976171255111694,
"learning_rate": 8.856080648899163e-05,
"loss": 1.8839,
"step": 914
},
{
"epoch": 0.2299428284224414,
"grad_norm": 0.4007084369659424,
"learning_rate": 8.853534425030123e-05,
"loss": 2.2063,
"step": 915
},
{
"epoch": 0.23019413206006156,
"grad_norm": 0.37610113620758057,
"learning_rate": 8.850985737420896e-05,
"loss": 2.5636,
"step": 916
},
{
"epoch": 0.23044543569768172,
"grad_norm": 0.3937523663043976,
"learning_rate": 8.84843458770097e-05,
"loss": 2.2865,
"step": 917
},
{
"epoch": 0.23069673933530188,
"grad_norm": 0.44338393211364746,
"learning_rate": 8.845880977501419e-05,
"loss": 2.2287,
"step": 918
},
{
"epoch": 0.23094804297292204,
"grad_norm": 0.4235187768936157,
"learning_rate": 8.843324908454875e-05,
"loss": 2.3839,
"step": 919
},
{
"epoch": 0.2311993466105422,
"grad_norm": 0.5745797157287598,
"learning_rate": 8.840766382195553e-05,
"loss": 1.9735,
"step": 920
},
{
"epoch": 0.23145065024816233,
"grad_norm": 0.35707423090934753,
"learning_rate": 8.838205400359234e-05,
"loss": 1.8084,
"step": 921
},
{
"epoch": 0.2317019538857825,
"grad_norm": 0.323047935962677,
"learning_rate": 8.835641964583272e-05,
"loss": 2.422,
"step": 922
},
{
"epoch": 0.23195325752340265,
"grad_norm": 0.4362463355064392,
"learning_rate": 8.833076076506588e-05,
"loss": 2.5153,
"step": 923
},
{
"epoch": 0.2322045611610228,
"grad_norm": 0.3632015883922577,
"learning_rate": 8.830507737769669e-05,
"loss": 2.7776,
"step": 924
},
{
"epoch": 0.23245586479864297,
"grad_norm": 0.48653021454811096,
"learning_rate": 8.827936950014573e-05,
"loss": 2.3679,
"step": 925
},
{
"epoch": 0.23270716843626313,
"grad_norm": 0.37184858322143555,
"learning_rate": 8.825363714884922e-05,
"loss": 1.9735,
"step": 926
},
{
"epoch": 0.23295847207388326,
"grad_norm": 0.38167500495910645,
"learning_rate": 8.822788034025903e-05,
"loss": 2.8957,
"step": 927
},
{
"epoch": 0.23320977571150342,
"grad_norm": 0.4858744144439697,
"learning_rate": 8.820209909084265e-05,
"loss": 2.4265,
"step": 928
},
{
"epoch": 0.23346107934912358,
"grad_norm": 0.4078523516654968,
"learning_rate": 8.81762934170832e-05,
"loss": 2.0574,
"step": 929
},
{
"epoch": 0.23371238298674374,
"grad_norm": 0.3559398353099823,
"learning_rate": 8.815046333547943e-05,
"loss": 2.2669,
"step": 930
},
{
"epoch": 0.2339636866243639,
"grad_norm": 0.4299301207065582,
"learning_rate": 8.81246088625457e-05,
"loss": 2.1053,
"step": 931
},
{
"epoch": 0.23421499026198403,
"grad_norm": 0.40756756067276,
"learning_rate": 8.809873001481193e-05,
"loss": 2.3635,
"step": 932
},
{
"epoch": 0.2344662938996042,
"grad_norm": 0.37873560190200806,
"learning_rate": 8.807282680882367e-05,
"loss": 1.0149,
"step": 933
},
{
"epoch": 0.23471759753722435,
"grad_norm": 0.40375521779060364,
"learning_rate": 8.8046899261142e-05,
"loss": 2.5615,
"step": 934
},
{
"epoch": 0.2349689011748445,
"grad_norm": 0.4414771497249603,
"learning_rate": 8.802094738834361e-05,
"loss": 2.513,
"step": 935
},
{
"epoch": 0.23522020481246467,
"grad_norm": 0.40930548310279846,
"learning_rate": 8.799497120702069e-05,
"loss": 1.5781,
"step": 936
},
{
"epoch": 0.23547150845008483,
"grad_norm": 0.4570627808570862,
"learning_rate": 8.7968970733781e-05,
"loss": 2.2293,
"step": 937
},
{
"epoch": 0.23572281208770496,
"grad_norm": 0.38974353671073914,
"learning_rate": 8.794294598524784e-05,
"loss": 1.959,
"step": 938
},
{
"epoch": 0.23597411572532512,
"grad_norm": 0.5479612946510315,
"learning_rate": 8.791689697806e-05,
"loss": 2.0345,
"step": 939
},
{
"epoch": 0.23622541936294528,
"grad_norm": 0.35705479979515076,
"learning_rate": 8.789082372887183e-05,
"loss": 2.2542,
"step": 940
},
{
"epoch": 0.23647672300056544,
"grad_norm": 0.4570086896419525,
"learning_rate": 8.786472625435311e-05,
"loss": 2.4306,
"step": 941
},
{
"epoch": 0.2367280266381856,
"grad_norm": 1.2224934101104736,
"learning_rate": 8.783860457118918e-05,
"loss": 2.031,
"step": 942
},
{
"epoch": 0.23697933027580576,
"grad_norm": 0.47019338607788086,
"learning_rate": 8.781245869608077e-05,
"loss": 2.5845,
"step": 943
},
{
"epoch": 0.2372306339134259,
"grad_norm": 0.4843159019947052,
"learning_rate": 8.778628864574419e-05,
"loss": 2.2976,
"step": 944
},
{
"epoch": 0.23748193755104605,
"grad_norm": 0.44156232476234436,
"learning_rate": 8.776009443691109e-05,
"loss": 1.7753,
"step": 945
},
{
"epoch": 0.2377332411886662,
"grad_norm": 0.4849649667739868,
"learning_rate": 8.773387608632867e-05,
"loss": 2.4643,
"step": 946
},
{
"epoch": 0.23798454482628636,
"grad_norm": 0.47490194439888,
"learning_rate": 8.770763361075949e-05,
"loss": 1.9425,
"step": 947
},
{
"epoch": 0.23823584846390652,
"grad_norm": 0.42135703563690186,
"learning_rate": 8.768136702698158e-05,
"loss": 2.2653,
"step": 948
},
{
"epoch": 0.23848715210152666,
"grad_norm": 0.3938981294631958,
"learning_rate": 8.765507635178832e-05,
"loss": 1.8336,
"step": 949
},
{
"epoch": 0.23873845573914682,
"grad_norm": 0.3590041399002075,
"learning_rate": 8.762876160198858e-05,
"loss": 2.083,
"step": 950
},
{
"epoch": 0.23898975937676697,
"grad_norm": 10.042701721191406,
"learning_rate": 8.760242279440657e-05,
"loss": 1.9433,
"step": 951
},
{
"epoch": 0.23924106301438713,
"grad_norm": 0.4117979407310486,
"learning_rate": 8.75760599458819e-05,
"loss": 1.9981,
"step": 952
},
{
"epoch": 0.2394923666520073,
"grad_norm": 0.3976224660873413,
"learning_rate": 8.754967307326951e-05,
"loss": 1.9221,
"step": 953
},
{
"epoch": 0.23974367028962745,
"grad_norm": 0.4007420241832733,
"learning_rate": 8.752326219343977e-05,
"loss": 2.1583,
"step": 954
},
{
"epoch": 0.23999497392724758,
"grad_norm": 0.43828290700912476,
"learning_rate": 8.74968273232783e-05,
"loss": 2.0958,
"step": 955
},
{
"epoch": 0.24024627756486774,
"grad_norm": 0.3099430501461029,
"learning_rate": 8.747036847968618e-05,
"loss": 2.0985,
"step": 956
},
{
"epoch": 0.2404975812024879,
"grad_norm": 0.2864762842655182,
"learning_rate": 8.744388567957971e-05,
"loss": 1.9034,
"step": 957
},
{
"epoch": 0.24074888484010806,
"grad_norm": 0.38019657135009766,
"learning_rate": 8.741737893989058e-05,
"loss": 2.2289,
"step": 958
},
{
"epoch": 0.24100018847772822,
"grad_norm": 0.506572425365448,
"learning_rate": 8.739084827756575e-05,
"loss": 2.3025,
"step": 959
},
{
"epoch": 0.24125149211534838,
"grad_norm": 0.4378896653652191,
"learning_rate": 8.736429370956746e-05,
"loss": 1.9396,
"step": 960
},
{
"epoch": 0.2415027957529685,
"grad_norm": 0.36668267846107483,
"learning_rate": 8.733771525287331e-05,
"loss": 2.7244,
"step": 961
},
{
"epoch": 0.24175409939058867,
"grad_norm": 0.3157023787498474,
"learning_rate": 8.731111292447605e-05,
"loss": 2.2407,
"step": 962
},
{
"epoch": 0.24200540302820883,
"grad_norm": 0.2563331127166748,
"learning_rate": 8.728448674138381e-05,
"loss": 2.4275,
"step": 963
},
{
"epoch": 0.242256706665829,
"grad_norm": 0.49870792031288147,
"learning_rate": 8.72578367206199e-05,
"loss": 2.0975,
"step": 964
},
{
"epoch": 0.24250801030344915,
"grad_norm": 0.328143835067749,
"learning_rate": 8.723116287922288e-05,
"loss": 2.0371,
"step": 965
},
{
"epoch": 0.2427593139410693,
"grad_norm": 0.40886473655700684,
"learning_rate": 8.72044652342466e-05,
"loss": 2.294,
"step": 966
},
{
"epoch": 0.24301061757868944,
"grad_norm": 0.39602747559547424,
"learning_rate": 8.717774380276002e-05,
"loss": 2.067,
"step": 967
},
{
"epoch": 0.2432619212163096,
"grad_norm": 0.33269646763801575,
"learning_rate": 8.715099860184743e-05,
"loss": 2.2003,
"step": 968
},
{
"epoch": 0.24351322485392976,
"grad_norm": 0.4260612726211548,
"learning_rate": 8.712422964860822e-05,
"loss": 2.4808,
"step": 969
},
{
"epoch": 0.24376452849154992,
"grad_norm": 0.3412139415740967,
"learning_rate": 8.7097436960157e-05,
"loss": 2.3457,
"step": 970
},
{
"epoch": 0.24401583212917008,
"grad_norm": 0.402170866727829,
"learning_rate": 8.707062055362359e-05,
"loss": 2.4638,
"step": 971
},
{
"epoch": 0.2442671357667902,
"grad_norm": 0.2597676217556,
"learning_rate": 8.70437804461529e-05,
"loss": 1.7648,
"step": 972
},
{
"epoch": 0.24451843940441037,
"grad_norm": 0.424844354391098,
"learning_rate": 8.701691665490504e-05,
"loss": 2.276,
"step": 973
},
{
"epoch": 0.24476974304203053,
"grad_norm": 0.1249924823641777,
"learning_rate": 8.699002919705527e-05,
"loss": 0.5754,
"step": 974
},
{
"epoch": 0.2450210466796507,
"grad_norm": 0.14425934851169586,
"learning_rate": 8.696311808979397e-05,
"loss": 0.9444,
"step": 975
},
{
"epoch": 0.24527235031727085,
"grad_norm": 0.5581234693527222,
"learning_rate": 8.693618335032663e-05,
"loss": 2.2831,
"step": 976
},
{
"epoch": 0.245523653954891,
"grad_norm": 0.17069493234157562,
"learning_rate": 8.690922499587387e-05,
"loss": 1.0646,
"step": 977
},
{
"epoch": 0.24577495759251114,
"grad_norm": 0.3384755551815033,
"learning_rate": 8.688224304367137e-05,
"loss": 1.8558,
"step": 978
},
{
"epoch": 0.2460262612301313,
"grad_norm": 0.3919682204723358,
"learning_rate": 8.685523751096994e-05,
"loss": 2.5674,
"step": 979
},
{
"epoch": 0.24627756486775146,
"grad_norm": 0.25237560272216797,
"learning_rate": 8.682820841503542e-05,
"loss": 1.7601,
"step": 980
},
{
"epoch": 0.24652886850537162,
"grad_norm": 0.3378541171550751,
"learning_rate": 8.680115577314877e-05,
"loss": 2.5141,
"step": 981
},
{
"epoch": 0.24678017214299178,
"grad_norm": 0.4153686761856079,
"learning_rate": 8.6774079602606e-05,
"loss": 1.1121,
"step": 982
},
{
"epoch": 0.24703147578061194,
"grad_norm": 0.3883324861526489,
"learning_rate": 8.67469799207181e-05,
"loss": 1.8358,
"step": 983
},
{
"epoch": 0.24728277941823207,
"grad_norm": 0.35025763511657715,
"learning_rate": 8.671985674481113e-05,
"loss": 1.7407,
"step": 984
},
{
"epoch": 0.24753408305585223,
"grad_norm": 0.1769835203886032,
"learning_rate": 8.66927100922262e-05,
"loss": 1.1858,
"step": 985
},
{
"epoch": 0.2477853866934724,
"grad_norm": 0.4593118131160736,
"learning_rate": 8.66655399803194e-05,
"loss": 2.4193,
"step": 986
},
{
"epoch": 0.24803669033109255,
"grad_norm": 0.24635930359363556,
"learning_rate": 8.663834642646178e-05,
"loss": 2.3587,
"step": 987
},
{
"epoch": 0.2482879939687127,
"grad_norm": 0.3709847927093506,
"learning_rate": 8.661112944803946e-05,
"loss": 2.5285,
"step": 988
},
{
"epoch": 0.24853929760633287,
"grad_norm": 0.3426840603351593,
"learning_rate": 8.65838890624535e-05,
"loss": 1.9006,
"step": 989
},
{
"epoch": 0.248790601243953,
"grad_norm": 0.29686790704727173,
"learning_rate": 8.655662528711987e-05,
"loss": 1.9789,
"step": 990
},
{
"epoch": 0.24904190488157316,
"grad_norm": 0.15335550904273987,
"learning_rate": 8.65293381394696e-05,
"loss": 1.379,
"step": 991
},
{
"epoch": 0.24929320851919332,
"grad_norm": 0.31200841069221497,
"learning_rate": 8.650202763694856e-05,
"loss": 2.6586,
"step": 992
},
{
"epoch": 0.24954451215681347,
"grad_norm": 0.28087717294692993,
"learning_rate": 8.647469379701765e-05,
"loss": 2.1206,
"step": 993
},
{
"epoch": 0.24979581579443363,
"grad_norm": 0.4623563289642334,
"learning_rate": 8.64473366371526e-05,
"loss": 2.3116,
"step": 994
},
{
"epoch": 0.25004711943205377,
"grad_norm": 0.3620995879173279,
"learning_rate": 8.641995617484411e-05,
"loss": 2.1746,
"step": 995
},
{
"epoch": 0.2502984230696739,
"grad_norm": 0.42614445090293884,
"learning_rate": 8.639255242759773e-05,
"loss": 2.1065,
"step": 996
},
{
"epoch": 0.2505497267072941,
"grad_norm": 0.5489148497581482,
"learning_rate": 8.636512541293396e-05,
"loss": 1.6889,
"step": 997
},
{
"epoch": 0.25080103034491424,
"grad_norm": 0.5294693112373352,
"learning_rate": 8.633767514838811e-05,
"loss": 1.9634,
"step": 998
},
{
"epoch": 0.2510523339825344,
"grad_norm": 0.2602441906929016,
"learning_rate": 8.631020165151041e-05,
"loss": 2.0264,
"step": 999
},
{
"epoch": 0.25130363762015456,
"grad_norm": 0.5113153457641602,
"learning_rate": 8.62827049398659e-05,
"loss": 2.47,
"step": 1000
},
{
"epoch": 0.2515549412577747,
"grad_norm": 0.21645587682724,
"learning_rate": 8.62551850310345e-05,
"loss": 1.525,
"step": 1001
},
{
"epoch": 0.2518062448953949,
"grad_norm": 0.48258715867996216,
"learning_rate": 8.622764194261093e-05,
"loss": 2.5367,
"step": 1002
},
{
"epoch": 0.25205754853301504,
"grad_norm": 0.4978300929069519,
"learning_rate": 8.620007569220472e-05,
"loss": 2.3648,
"step": 1003
},
{
"epoch": 0.25230885217063515,
"grad_norm": 0.3442172110080719,
"learning_rate": 8.617248629744028e-05,
"loss": 2.1073,
"step": 1004
},
{
"epoch": 0.2525601558082553,
"grad_norm": 0.2076217383146286,
"learning_rate": 8.614487377595672e-05,
"loss": 1.2976,
"step": 1005
},
{
"epoch": 0.25281145944587546,
"grad_norm": 0.41670069098472595,
"learning_rate": 8.611723814540802e-05,
"loss": 2.4169,
"step": 1006
},
{
"epoch": 0.2530627630834956,
"grad_norm": 0.1605192869901657,
"learning_rate": 8.608957942346287e-05,
"loss": 1.2566,
"step": 1007
},
{
"epoch": 0.2533140667211158,
"grad_norm": 0.3367489278316498,
"learning_rate": 8.606189762780478e-05,
"loss": 2.5501,
"step": 1008
},
{
"epoch": 0.25356537035873594,
"grad_norm": 0.5993397235870361,
"learning_rate": 8.603419277613195e-05,
"loss": 2.5038,
"step": 1009
},
{
"epoch": 0.2538166739963561,
"grad_norm": 0.4803079068660736,
"learning_rate": 8.600646488615738e-05,
"loss": 2.5279,
"step": 1010
},
{
"epoch": 0.25406797763397626,
"grad_norm": 0.22795291244983673,
"learning_rate": 8.597871397560877e-05,
"loss": 2.4772,
"step": 1011
},
{
"epoch": 0.2543192812715964,
"grad_norm": 0.43239662051200867,
"learning_rate": 8.595094006222853e-05,
"loss": 2.4182,
"step": 1012
},
{
"epoch": 0.2545705849092166,
"grad_norm": 0.5313870310783386,
"learning_rate": 8.592314316377379e-05,
"loss": 1.6524,
"step": 1013
},
{
"epoch": 0.25482188854683674,
"grad_norm": 0.30473679304122925,
"learning_rate": 8.589532329801639e-05,
"loss": 1.8479,
"step": 1014
},
{
"epoch": 0.25507319218445684,
"grad_norm": 0.5765677690505981,
"learning_rate": 8.586748048274281e-05,
"loss": 2.4291,
"step": 1015
},
{
"epoch": 0.255324495822077,
"grad_norm": 0.5507373809814453,
"learning_rate": 8.583961473575426e-05,
"loss": 1.8522,
"step": 1016
},
{
"epoch": 0.25557579945969716,
"grad_norm": 0.2905975878238678,
"learning_rate": 8.581172607486658e-05,
"loss": 2.0781,
"step": 1017
},
{
"epoch": 0.2558271030973173,
"grad_norm": 0.27408596873283386,
"learning_rate": 8.578381451791025e-05,
"loss": 2.203,
"step": 1018
},
{
"epoch": 0.2560784067349375,
"grad_norm": 0.3338567316532135,
"learning_rate": 8.575588008273039e-05,
"loss": 2.1768,
"step": 1019
},
{
"epoch": 0.25632971037255764,
"grad_norm": 0.44413185119628906,
"learning_rate": 8.572792278718678e-05,
"loss": 2.2131,
"step": 1020
},
{
"epoch": 0.2565810140101778,
"grad_norm": 0.3354492783546448,
"learning_rate": 8.569994264915378e-05,
"loss": 2.367,
"step": 1021
},
{
"epoch": 0.25683231764779796,
"grad_norm": 0.4711921513080597,
"learning_rate": 8.567193968652037e-05,
"loss": 2.0846,
"step": 1022
},
{
"epoch": 0.2570836212854181,
"grad_norm": 0.4673987925052643,
"learning_rate": 8.564391391719013e-05,
"loss": 2.2828,
"step": 1023
},
{
"epoch": 0.2573349249230383,
"grad_norm": 0.3377786874771118,
"learning_rate": 8.56158653590812e-05,
"loss": 1.8866,
"step": 1024
},
{
"epoch": 0.25758622856065844,
"grad_norm": 0.3204606771469116,
"learning_rate": 8.558779403012631e-05,
"loss": 2.3215,
"step": 1025
},
{
"epoch": 0.2578375321982786,
"grad_norm": 0.2833389341831207,
"learning_rate": 8.555969994827272e-05,
"loss": 1.8964,
"step": 1026
},
{
"epoch": 0.2580888358358987,
"grad_norm": 0.33112481236457825,
"learning_rate": 8.553158313148229e-05,
"loss": 2.3466,
"step": 1027
},
{
"epoch": 0.25834013947351886,
"grad_norm": 0.3505350351333618,
"learning_rate": 8.550344359773135e-05,
"loss": 2.2383,
"step": 1028
},
{
"epoch": 0.258591443111139,
"grad_norm": 0.39331895112991333,
"learning_rate": 8.547528136501079e-05,
"loss": 1.9421,
"step": 1029
},
{
"epoch": 0.2588427467487592,
"grad_norm": 0.40246421098709106,
"learning_rate": 8.544709645132605e-05,
"loss": 2.3665,
"step": 1030
},
{
"epoch": 0.25909405038637934,
"grad_norm": 0.3660884499549866,
"learning_rate": 8.541888887469698e-05,
"loss": 2.3929,
"step": 1031
},
{
"epoch": 0.2593453540239995,
"grad_norm": 0.47951841354370117,
"learning_rate": 8.5390658653158e-05,
"loss": 2.4356,
"step": 1032
},
{
"epoch": 0.25959665766161966,
"grad_norm": 0.42761266231536865,
"learning_rate": 8.536240580475795e-05,
"loss": 2.3366,
"step": 1033
},
{
"epoch": 0.2598479612992398,
"grad_norm": 0.3438764214515686,
"learning_rate": 8.53341303475602e-05,
"loss": 2.2531,
"step": 1034
},
{
"epoch": 0.26009926493686,
"grad_norm": 0.6126356720924377,
"learning_rate": 8.530583229964249e-05,
"loss": 2.0537,
"step": 1035
},
{
"epoch": 0.26035056857448013,
"grad_norm": 0.24006953835487366,
"learning_rate": 8.527751167909711e-05,
"loss": 2.0906,
"step": 1036
},
{
"epoch": 0.2606018722121003,
"grad_norm": 0.5417637825012207,
"learning_rate": 8.524916850403068e-05,
"loss": 2.1112,
"step": 1037
},
{
"epoch": 0.2608531758497204,
"grad_norm": 0.2629411220550537,
"learning_rate": 8.52208027925643e-05,
"loss": 1.0985,
"step": 1038
},
{
"epoch": 0.26110447948734056,
"grad_norm": 0.36217573285102844,
"learning_rate": 8.519241456283345e-05,
"loss": 1.8339,
"step": 1039
},
{
"epoch": 0.2613557831249607,
"grad_norm": 0.398267537355423,
"learning_rate": 8.516400383298803e-05,
"loss": 2.3536,
"step": 1040
},
{
"epoch": 0.2616070867625809,
"grad_norm": 0.4537069499492645,
"learning_rate": 8.513557062119232e-05,
"loss": 1.8612,
"step": 1041
},
{
"epoch": 0.26185839040020104,
"grad_norm": 0.39333221316337585,
"learning_rate": 8.510711494562496e-05,
"loss": 2.1242,
"step": 1042
},
{
"epoch": 0.2621096940378212,
"grad_norm": 0.3150976300239563,
"learning_rate": 8.507863682447896e-05,
"loss": 2.1419,
"step": 1043
},
{
"epoch": 0.26236099767544135,
"grad_norm": 0.33974120020866394,
"learning_rate": 8.505013627596167e-05,
"loss": 2.7756,
"step": 1044
},
{
"epoch": 0.2626123013130615,
"grad_norm": 0.23866431415081024,
"learning_rate": 8.502161331829483e-05,
"loss": 2.24,
"step": 1045
},
{
"epoch": 0.2628636049506817,
"grad_norm": 0.3774257004261017,
"learning_rate": 8.499306796971443e-05,
"loss": 2.6075,
"step": 1046
},
{
"epoch": 0.26311490858830183,
"grad_norm": 0.3210359215736389,
"learning_rate": 8.49645002484708e-05,
"loss": 2.1907,
"step": 1047
},
{
"epoch": 0.263366212225922,
"grad_norm": 0.5776761174201965,
"learning_rate": 8.493591017282862e-05,
"loss": 1.9049,
"step": 1048
},
{
"epoch": 0.26361751586354215,
"grad_norm": 0.3872734606266022,
"learning_rate": 8.490729776106681e-05,
"loss": 1.8222,
"step": 1049
},
{
"epoch": 0.26386881950116226,
"grad_norm": 0.36327892541885376,
"learning_rate": 8.48786630314786e-05,
"loss": 2.2852,
"step": 1050
},
{
"epoch": 0.2641201231387824,
"grad_norm": 0.5938968658447266,
"learning_rate": 8.485000600237147e-05,
"loss": 2.2302,
"step": 1051
},
{
"epoch": 0.2643714267764026,
"grad_norm": 0.23401345312595367,
"learning_rate": 8.482132669206718e-05,
"loss": 2.2253,
"step": 1052
},
{
"epoch": 0.26462273041402273,
"grad_norm": 0.36279526352882385,
"learning_rate": 8.479262511890168e-05,
"loss": 2.5141,
"step": 1053
},
{
"epoch": 0.2648740340516429,
"grad_norm": 0.362393319606781,
"learning_rate": 8.476390130122524e-05,
"loss": 2.4776,
"step": 1054
},
{
"epoch": 0.26512533768926305,
"grad_norm": 0.42770296335220337,
"learning_rate": 8.473515525740228e-05,
"loss": 2.5201,
"step": 1055
},
{
"epoch": 0.2653766413268832,
"grad_norm": 0.3319082260131836,
"learning_rate": 8.470638700581146e-05,
"loss": 2.2149,
"step": 1056
},
{
"epoch": 0.26562794496450337,
"grad_norm": 0.6831150054931641,
"learning_rate": 8.467759656484563e-05,
"loss": 2.0079,
"step": 1057
},
{
"epoch": 0.26587924860212353,
"grad_norm": 0.3941812813282013,
"learning_rate": 8.464878395291186e-05,
"loss": 2.2493,
"step": 1058
},
{
"epoch": 0.2661305522397437,
"grad_norm": 0.5003316402435303,
"learning_rate": 8.461994918843133e-05,
"loss": 2.0187,
"step": 1059
},
{
"epoch": 0.26638185587736385,
"grad_norm": 0.7166873216629028,
"learning_rate": 8.459109228983943e-05,
"loss": 2.4745,
"step": 1060
},
{
"epoch": 0.26663315951498395,
"grad_norm": 0.5578575134277344,
"learning_rate": 8.456221327558572e-05,
"loss": 2.4646,
"step": 1061
},
{
"epoch": 0.2668844631526041,
"grad_norm": 1.8631904125213623,
"learning_rate": 8.45333121641338e-05,
"loss": 2.7934,
"step": 1062
},
{
"epoch": 0.26713576679022427,
"grad_norm": 0.30865031480789185,
"learning_rate": 8.450438897396154e-05,
"loss": 1.8842,
"step": 1063
},
{
"epoch": 0.26738707042784443,
"grad_norm": 0.3725493848323822,
"learning_rate": 8.447544372356082e-05,
"loss": 1.9359,
"step": 1064
},
{
"epoch": 0.2676383740654646,
"grad_norm": 0.5875294804573059,
"learning_rate": 8.444647643143766e-05,
"loss": 2.611,
"step": 1065
},
{
"epoch": 0.26788967770308475,
"grad_norm": 0.20553795993328094,
"learning_rate": 8.441748711611216e-05,
"loss": 2.3654,
"step": 1066
},
{
"epoch": 0.2681409813407049,
"grad_norm": 0.814967691898346,
"learning_rate": 8.438847579611855e-05,
"loss": 1.9915,
"step": 1067
},
{
"epoch": 0.26839228497832507,
"grad_norm": 0.19453756511211395,
"learning_rate": 8.435944249000504e-05,
"loss": 1.5404,
"step": 1068
},
{
"epoch": 0.26864358861594523,
"grad_norm": 0.2923291027545929,
"learning_rate": 8.433038721633399e-05,
"loss": 2.6746,
"step": 1069
},
{
"epoch": 0.2688948922535654,
"grad_norm": 0.4880603551864624,
"learning_rate": 8.430130999368173e-05,
"loss": 2.8785,
"step": 1070
},
{
"epoch": 0.26914619589118555,
"grad_norm": 0.3680141270160675,
"learning_rate": 8.427221084063866e-05,
"loss": 2.2196,
"step": 1071
},
{
"epoch": 0.2693974995288057,
"grad_norm": 0.3895852863788605,
"learning_rate": 8.42430897758092e-05,
"loss": 2.1979,
"step": 1072
},
{
"epoch": 0.2696488031664258,
"grad_norm": 0.1814226508140564,
"learning_rate": 8.421394681781176e-05,
"loss": 0.8641,
"step": 1073
},
{
"epoch": 0.26990010680404597,
"grad_norm": 0.5926142334938049,
"learning_rate": 8.418478198527878e-05,
"loss": 2.3769,
"step": 1074
},
{
"epoch": 0.27015141044166613,
"grad_norm": 0.47387734055519104,
"learning_rate": 8.415559529685666e-05,
"loss": 2.4868,
"step": 1075
},
{
"epoch": 0.2704027140792863,
"grad_norm": 0.33305391669273376,
"learning_rate": 8.412638677120577e-05,
"loss": 1.9719,
"step": 1076
},
{
"epoch": 0.27065401771690645,
"grad_norm": 0.40535521507263184,
"learning_rate": 8.409715642700048e-05,
"loss": 2.0834,
"step": 1077
},
{
"epoch": 0.2709053213545266,
"grad_norm": 0.2667929530143738,
"learning_rate": 8.406790428292906e-05,
"loss": 2.2652,
"step": 1078
},
{
"epoch": 0.27115662499214677,
"grad_norm": 0.35209354758262634,
"learning_rate": 8.403863035769372e-05,
"loss": 1.9958,
"step": 1079
},
{
"epoch": 0.2714079286297669,
"grad_norm": 0.4001673758029938,
"learning_rate": 8.400933467001066e-05,
"loss": 2.2901,
"step": 1080
},
{
"epoch": 0.2716592322673871,
"grad_norm": 0.397296667098999,
"learning_rate": 8.39800172386099e-05,
"loss": 2.3905,
"step": 1081
},
{
"epoch": 0.27191053590500724,
"grad_norm": 0.40791282057762146,
"learning_rate": 8.395067808223547e-05,
"loss": 2.0415,
"step": 1082
},
{
"epoch": 0.2721618395426274,
"grad_norm": 0.42828860878944397,
"learning_rate": 8.392131721964516e-05,
"loss": 2.4725,
"step": 1083
},
{
"epoch": 0.2724131431802475,
"grad_norm": 0.2626012861728668,
"learning_rate": 8.389193466961076e-05,
"loss": 2.4184,
"step": 1084
},
{
"epoch": 0.27266444681786767,
"grad_norm": 0.5925227999687195,
"learning_rate": 8.386253045091785e-05,
"loss": 2.4812,
"step": 1085
},
{
"epoch": 0.2729157504554878,
"grad_norm": 0.43262800574302673,
"learning_rate": 8.383310458236589e-05,
"loss": 2.0934,
"step": 1086
},
{
"epoch": 0.273167054093108,
"grad_norm": 0.3726051151752472,
"learning_rate": 8.380365708276818e-05,
"loss": 2.1072,
"step": 1087
},
{
"epoch": 0.27341835773072815,
"grad_norm": 0.2803446352481842,
"learning_rate": 8.377418797095186e-05,
"loss": 1.6965,
"step": 1088
},
{
"epoch": 0.2736696613683483,
"grad_norm": 0.4271281361579895,
"learning_rate": 8.374469726575786e-05,
"loss": 2.499,
"step": 1089
},
{
"epoch": 0.27392096500596846,
"grad_norm": 0.4066520929336548,
"learning_rate": 8.371518498604094e-05,
"loss": 2.5189,
"step": 1090
},
{
"epoch": 0.2741722686435886,
"grad_norm": 0.6859603524208069,
"learning_rate": 8.368565115066963e-05,
"loss": 2.4845,
"step": 1091
},
{
"epoch": 0.2744235722812088,
"grad_norm": 0.5193429589271545,
"learning_rate": 8.365609577852629e-05,
"loss": 2.2581,
"step": 1092
},
{
"epoch": 0.27467487591882894,
"grad_norm": 0.2996985912322998,
"learning_rate": 8.362651888850699e-05,
"loss": 2.7202,
"step": 1093
},
{
"epoch": 0.2749261795564491,
"grad_norm": 0.4352916181087494,
"learning_rate": 8.359692049952158e-05,
"loss": 1.9127,
"step": 1094
},
{
"epoch": 0.27517748319406926,
"grad_norm": 0.5546665191650391,
"learning_rate": 8.356730063049368e-05,
"loss": 1.5597,
"step": 1095
},
{
"epoch": 0.27542878683168937,
"grad_norm": 0.45468562841415405,
"learning_rate": 8.35376593003606e-05,
"loss": 2.2546,
"step": 1096
},
{
"epoch": 0.2756800904693095,
"grad_norm": 0.5346075892448425,
"learning_rate": 8.350799652807342e-05,
"loss": 2.1757,
"step": 1097
},
{
"epoch": 0.2759313941069297,
"grad_norm": 0.4020247161388397,
"learning_rate": 8.347831233259688e-05,
"loss": 2.5377,
"step": 1098
},
{
"epoch": 0.27618269774454984,
"grad_norm": 0.44145265221595764,
"learning_rate": 8.344860673290946e-05,
"loss": 2.526,
"step": 1099
},
{
"epoch": 0.27643400138217,
"grad_norm": 0.3393920361995697,
"learning_rate": 8.341887974800328e-05,
"loss": 2.4143,
"step": 1100
},
{
"epoch": 0.27668530501979016,
"grad_norm": 3.1971209049224854,
"learning_rate": 8.338913139688416e-05,
"loss": 2.1105,
"step": 1101
},
{
"epoch": 0.2769366086574103,
"grad_norm": 0.28271710872650146,
"learning_rate": 8.335936169857159e-05,
"loss": 1.4886,
"step": 1102
},
{
"epoch": 0.2771879122950305,
"grad_norm": 0.4334240257740021,
"learning_rate": 8.332957067209869e-05,
"loss": 2.3919,
"step": 1103
},
{
"epoch": 0.27743921593265064,
"grad_norm": 0.31101804971694946,
"learning_rate": 8.329975833651221e-05,
"loss": 2.5608,
"step": 1104
},
{
"epoch": 0.2776905195702708,
"grad_norm": 0.5559032559394836,
"learning_rate": 8.326992471087256e-05,
"loss": 2.0525,
"step": 1105
},
{
"epoch": 0.27794182320789096,
"grad_norm": 0.2627129852771759,
"learning_rate": 8.324006981425371e-05,
"loss": 2.4173,
"step": 1106
},
{
"epoch": 0.27819312684551106,
"grad_norm": 0.5957119464874268,
"learning_rate": 8.321019366574326e-05,
"loss": 2.3493,
"step": 1107
},
{
"epoch": 0.2784444304831312,
"grad_norm": 0.4542389512062073,
"learning_rate": 8.318029628444241e-05,
"loss": 2.2662,
"step": 1108
},
{
"epoch": 0.2786957341207514,
"grad_norm": 0.6782068014144897,
"learning_rate": 8.315037768946591e-05,
"loss": 2.0514,
"step": 1109
},
{
"epoch": 0.27894703775837154,
"grad_norm": 0.17118225991725922,
"learning_rate": 8.312043789994209e-05,
"loss": 0.6076,
"step": 1110
},
{
"epoch": 0.2791983413959917,
"grad_norm": 0.598452091217041,
"learning_rate": 8.309047693501278e-05,
"loss": 1.9878,
"step": 1111
},
{
"epoch": 0.27944964503361186,
"grad_norm": 0.4604327976703644,
"learning_rate": 8.306049481383344e-05,
"loss": 2.0652,
"step": 1112
},
{
"epoch": 0.279700948671232,
"grad_norm": 0.3509052097797394,
"learning_rate": 8.3030491555573e-05,
"loss": 2.3199,
"step": 1113
},
{
"epoch": 0.2799522523088522,
"grad_norm": 0.32564830780029297,
"learning_rate": 8.300046717941387e-05,
"loss": 2.5692,
"step": 1114
},
{
"epoch": 0.28020355594647234,
"grad_norm": 0.4420936107635498,
"learning_rate": 8.297042170455207e-05,
"loss": 2.1994,
"step": 1115
},
{
"epoch": 0.2804548595840925,
"grad_norm": 0.19294236600399017,
"learning_rate": 8.294035515019699e-05,
"loss": 1.8342,
"step": 1116
},
{
"epoch": 0.28070616322171266,
"grad_norm": 0.28902173042297363,
"learning_rate": 8.291026753557154e-05,
"loss": 1.9677,
"step": 1117
},
{
"epoch": 0.2809574668593328,
"grad_norm": 0.5199251770973206,
"learning_rate": 8.288015887991214e-05,
"loss": 2.2992,
"step": 1118
},
{
"epoch": 0.2812087704969529,
"grad_norm": 0.445320725440979,
"learning_rate": 8.285002920246862e-05,
"loss": 2.0663,
"step": 1119
},
{
"epoch": 0.2814600741345731,
"grad_norm": 0.3564877510070801,
"learning_rate": 8.281987852250424e-05,
"loss": 2.447,
"step": 1120
},
{
"epoch": 0.28171137777219324,
"grad_norm": 0.4224308133125305,
"learning_rate": 8.27897068592957e-05,
"loss": 2.228,
"step": 1121
},
{
"epoch": 0.2819626814098134,
"grad_norm": 0.40947914123535156,
"learning_rate": 8.275951423213312e-05,
"loss": 2.3052,
"step": 1122
},
{
"epoch": 0.28221398504743356,
"grad_norm": 0.38864952325820923,
"learning_rate": 8.272930066032006e-05,
"loss": 2.1341,
"step": 1123
},
{
"epoch": 0.2824652886850537,
"grad_norm": 0.5043284893035889,
"learning_rate": 8.26990661631734e-05,
"loss": 1.8143,
"step": 1124
},
{
"epoch": 0.2827165923226739,
"grad_norm": 0.3912737965583801,
"learning_rate": 8.266881076002344e-05,
"loss": 2.1044,
"step": 1125
},
{
"epoch": 0.28296789596029404,
"grad_norm": 0.39619266986846924,
"learning_rate": 8.263853447021382e-05,
"loss": 2.2082,
"step": 1126
},
{
"epoch": 0.2832191995979142,
"grad_norm": 0.2806078791618347,
"learning_rate": 8.260823731310159e-05,
"loss": 2.2696,
"step": 1127
},
{
"epoch": 0.28347050323553435,
"grad_norm": 0.4805527627468109,
"learning_rate": 8.257791930805707e-05,
"loss": 2.0736,
"step": 1128
},
{
"epoch": 0.2837218068731545,
"grad_norm": 0.37303459644317627,
"learning_rate": 8.254758047446394e-05,
"loss": 2.1587,
"step": 1129
},
{
"epoch": 0.2839731105107746,
"grad_norm": 0.4668956398963928,
"learning_rate": 8.251722083171923e-05,
"loss": 1.9901,
"step": 1130
},
{
"epoch": 0.2842244141483948,
"grad_norm": 0.2532447278499603,
"learning_rate": 8.248684039923322e-05,
"loss": 2.2625,
"step": 1131
},
{
"epoch": 0.28447571778601494,
"grad_norm": 0.5377678871154785,
"learning_rate": 8.245643919642951e-05,
"loss": 2.0567,
"step": 1132
},
{
"epoch": 0.2847270214236351,
"grad_norm": 0.4332278370857239,
"learning_rate": 8.242601724274498e-05,
"loss": 2.323,
"step": 1133
},
{
"epoch": 0.28497832506125526,
"grad_norm": 0.284900039434433,
"learning_rate": 8.239557455762976e-05,
"loss": 2.0663,
"step": 1134
},
{
"epoch": 0.2852296286988754,
"grad_norm": 0.385785847902298,
"learning_rate": 8.236511116054725e-05,
"loss": 2.2765,
"step": 1135
},
{
"epoch": 0.2854809323364956,
"grad_norm": 0.3278866410255432,
"learning_rate": 8.233462707097413e-05,
"loss": 2.3421,
"step": 1136
},
{
"epoch": 0.28573223597411573,
"grad_norm": 0.31879276037216187,
"learning_rate": 8.230412230840019e-05,
"loss": 2.6483,
"step": 1137
},
{
"epoch": 0.2859835396117359,
"grad_norm": 0.41390541195869446,
"learning_rate": 8.227359689232856e-05,
"loss": 2.5977,
"step": 1138
},
{
"epoch": 0.28623484324935605,
"grad_norm": 0.657051682472229,
"learning_rate": 8.224305084227553e-05,
"loss": 2.1319,
"step": 1139
},
{
"epoch": 0.2864861468869762,
"grad_norm": 0.47445768117904663,
"learning_rate": 8.221248417777057e-05,
"loss": 2.4098,
"step": 1140
},
{
"epoch": 0.28673745052459637,
"grad_norm": 0.3382059931755066,
"learning_rate": 8.218189691835636e-05,
"loss": 2.4012,
"step": 1141
},
{
"epoch": 0.2869887541622165,
"grad_norm": 0.23410090804100037,
"learning_rate": 8.215128908358871e-05,
"loss": 0.9557,
"step": 1142
},
{
"epoch": 0.28724005779983663,
"grad_norm": 0.3863859176635742,
"learning_rate": 8.212066069303661e-05,
"loss": 2.5172,
"step": 1143
},
{
"epoch": 0.2874913614374568,
"grad_norm": 0.4687202572822571,
"learning_rate": 8.209001176628218e-05,
"loss": 2.2719,
"step": 1144
},
{
"epoch": 0.28774266507507695,
"grad_norm": 0.3292360007762909,
"learning_rate": 8.20593423229207e-05,
"loss": 2.4493,
"step": 1145
},
{
"epoch": 0.2879939687126971,
"grad_norm": 0.3099132180213928,
"learning_rate": 8.20286523825605e-05,
"loss": 2.2859,
"step": 1146
},
{
"epoch": 0.28824527235031727,
"grad_norm": 0.4771600663661957,
"learning_rate": 8.19979419648231e-05,
"loss": 2.1732,
"step": 1147
},
{
"epoch": 0.28849657598793743,
"grad_norm": 0.23781466484069824,
"learning_rate": 8.196721108934305e-05,
"loss": 2.0926,
"step": 1148
},
{
"epoch": 0.2887478796255576,
"grad_norm": 0.267600953578949,
"learning_rate": 8.1936459775768e-05,
"loss": 2.3248,
"step": 1149
},
{
"epoch": 0.28899918326317775,
"grad_norm": 0.28264153003692627,
"learning_rate": 8.190568804375867e-05,
"loss": 1.7394,
"step": 1150
},
{
"epoch": 0.2892504869007979,
"grad_norm": 0.47913792729377747,
"learning_rate": 8.187489591298883e-05,
"loss": 2.2019,
"step": 1151
},
{
"epoch": 0.28950179053841807,
"grad_norm": 0.46330931782722473,
"learning_rate": 8.184408340314528e-05,
"loss": 2.0046,
"step": 1152
},
{
"epoch": 0.2897530941760382,
"grad_norm": 0.46758121252059937,
"learning_rate": 8.181325053392788e-05,
"loss": 2.7781,
"step": 1153
},
{
"epoch": 0.29000439781365833,
"grad_norm": 0.4416895806789398,
"learning_rate": 8.178239732504949e-05,
"loss": 1.9636,
"step": 1154
},
{
"epoch": 0.2902557014512785,
"grad_norm": 0.4437691867351532,
"learning_rate": 8.175152379623597e-05,
"loss": 2.0821,
"step": 1155
},
{
"epoch": 0.29050700508889865,
"grad_norm": 0.41940081119537354,
"learning_rate": 8.172062996722615e-05,
"loss": 2.164,
"step": 1156
},
{
"epoch": 0.2907583087265188,
"grad_norm": 0.34322845935821533,
"learning_rate": 8.168971585777189e-05,
"loss": 2.3625,
"step": 1157
},
{
"epoch": 0.29100961236413897,
"grad_norm": 0.349292516708374,
"learning_rate": 8.165878148763797e-05,
"loss": 2.1895,
"step": 1158
},
{
"epoch": 0.29126091600175913,
"grad_norm": 0.4713262915611267,
"learning_rate": 8.162782687660218e-05,
"loss": 2.1822,
"step": 1159
},
{
"epoch": 0.2915122196393793,
"grad_norm": 0.498413622379303,
"learning_rate": 8.159685204445517e-05,
"loss": 1.8382,
"step": 1160
},
{
"epoch": 0.29176352327699945,
"grad_norm": 0.2707071602344513,
"learning_rate": 8.156585701100057e-05,
"loss": 1.8682,
"step": 1161
},
{
"epoch": 0.2920148269146196,
"grad_norm": 0.16994576156139374,
"learning_rate": 8.153484179605494e-05,
"loss": 1.4461,
"step": 1162
},
{
"epoch": 0.29226613055223977,
"grad_norm": 0.46193161606788635,
"learning_rate": 8.150380641944771e-05,
"loss": 2.2424,
"step": 1163
},
{
"epoch": 0.29251743418985987,
"grad_norm": 0.4411792457103729,
"learning_rate": 8.14727509010212e-05,
"loss": 2.4993,
"step": 1164
},
{
"epoch": 0.29276873782748003,
"grad_norm": 0.49905550479888916,
"learning_rate": 8.144167526063062e-05,
"loss": 2.0086,
"step": 1165
},
{
"epoch": 0.2930200414651002,
"grad_norm": 0.3959316909313202,
"learning_rate": 8.141057951814405e-05,
"loss": 2.1823,
"step": 1166
},
{
"epoch": 0.29327134510272035,
"grad_norm": 0.4969998002052307,
"learning_rate": 8.137946369344243e-05,
"loss": 1.7895,
"step": 1167
},
{
"epoch": 0.2935226487403405,
"grad_norm": 0.3000521659851074,
"learning_rate": 8.13483278064195e-05,
"loss": 2.1703,
"step": 1168
},
{
"epoch": 0.29377395237796067,
"grad_norm": 0.3654926121234894,
"learning_rate": 8.131717187698185e-05,
"loss": 2.311,
"step": 1169
},
{
"epoch": 0.2940252560155808,
"grad_norm": 0.4902850091457367,
"learning_rate": 8.128599592504891e-05,
"loss": 2.4557,
"step": 1170
},
{
"epoch": 0.294276559653201,
"grad_norm": 0.3724076449871063,
"learning_rate": 8.125479997055286e-05,
"loss": 2.0032,
"step": 1171
},
{
"epoch": 0.29452786329082115,
"grad_norm": 0.4023280739784241,
"learning_rate": 8.122358403343872e-05,
"loss": 2.2439,
"step": 1172
},
{
"epoch": 0.2947791669284413,
"grad_norm": 0.46174219250679016,
"learning_rate": 8.119234813366425e-05,
"loss": 2.0743,
"step": 1173
},
{
"epoch": 0.29503047056606146,
"grad_norm": 0.30935728549957275,
"learning_rate": 8.11610922912e-05,
"loss": 2.7031,
"step": 1174
},
{
"epoch": 0.2952817742036816,
"grad_norm": 0.3213157653808594,
"learning_rate": 8.11298165260292e-05,
"loss": 2.5725,
"step": 1175
},
{
"epoch": 0.29553307784130173,
"grad_norm": 0.5222627520561218,
"learning_rate": 8.109852085814796e-05,
"loss": 2.2112,
"step": 1176
},
{
"epoch": 0.2957843814789219,
"grad_norm": 0.3197146952152252,
"learning_rate": 8.106720530756495e-05,
"loss": 2.4653,
"step": 1177
},
{
"epoch": 0.29603568511654205,
"grad_norm": 0.34864383935928345,
"learning_rate": 8.103586989430165e-05,
"loss": 2.395,
"step": 1178
},
{
"epoch": 0.2962869887541622,
"grad_norm": 0.5509148240089417,
"learning_rate": 8.100451463839223e-05,
"loss": 2.0561,
"step": 1179
},
{
"epoch": 0.29653829239178237,
"grad_norm": 0.5259515643119812,
"learning_rate": 8.097313955988355e-05,
"loss": 2.1252,
"step": 1180
},
{
"epoch": 0.2967895960294025,
"grad_norm": 0.2766565978527069,
"learning_rate": 8.094174467883509e-05,
"loss": 1.5991,
"step": 1181
},
{
"epoch": 0.2970408996670227,
"grad_norm": 0.3643835484981537,
"learning_rate": 8.091033001531905e-05,
"loss": 1.9686,
"step": 1182
},
{
"epoch": 0.29729220330464284,
"grad_norm": 0.45411252975463867,
"learning_rate": 8.087889558942023e-05,
"loss": 2.2172,
"step": 1183
},
{
"epoch": 0.297543506942263,
"grad_norm": 0.47071462869644165,
"learning_rate": 8.084744142123613e-05,
"loss": 1.9389,
"step": 1184
},
{
"epoch": 0.29779481057988316,
"grad_norm": 0.3977017402648926,
"learning_rate": 8.081596753087682e-05,
"loss": 2.1713,
"step": 1185
},
{
"epoch": 0.2980461142175033,
"grad_norm": 0.6268634796142578,
"learning_rate": 8.0784473938465e-05,
"loss": 2.168,
"step": 1186
},
{
"epoch": 0.2982974178551234,
"grad_norm": 0.31418439745903015,
"learning_rate": 8.075296066413593e-05,
"loss": 2.0036,
"step": 1187
},
{
"epoch": 0.2985487214927436,
"grad_norm": 1.348002314567566,
"learning_rate": 8.07214277280375e-05,
"loss": 2.6528,
"step": 1188
},
{
"epoch": 0.29880002513036374,
"grad_norm": 0.2204774022102356,
"learning_rate": 8.068987515033015e-05,
"loss": 2.1439,
"step": 1189
},
{
"epoch": 0.2990513287679839,
"grad_norm": 0.34849774837493896,
"learning_rate": 8.06583029511869e-05,
"loss": 2.1119,
"step": 1190
},
{
"epoch": 0.29930263240560406,
"grad_norm": 0.3371882438659668,
"learning_rate": 8.062671115079325e-05,
"loss": 2.3992,
"step": 1191
},
{
"epoch": 0.2995539360432242,
"grad_norm": 0.5298404097557068,
"learning_rate": 8.059509976934733e-05,
"loss": 1.9743,
"step": 1192
},
{
"epoch": 0.2998052396808444,
"grad_norm": 0.33625608682632446,
"learning_rate": 8.056346882705972e-05,
"loss": 1.9329,
"step": 1193
},
{
"epoch": 0.30005654331846454,
"grad_norm": 0.3414923846721649,
"learning_rate": 8.053181834415351e-05,
"loss": 2.3464,
"step": 1194
},
{
"epoch": 0.3003078469560847,
"grad_norm": 0.49444150924682617,
"learning_rate": 8.05001483408643e-05,
"loss": 2.5747,
"step": 1195
},
{
"epoch": 0.30055915059370486,
"grad_norm": 0.4675874412059784,
"learning_rate": 8.046845883744016e-05,
"loss": 2.0696,
"step": 1196
},
{
"epoch": 0.300810454231325,
"grad_norm": 0.1863304227590561,
"learning_rate": 8.043674985414167e-05,
"loss": 1.0368,
"step": 1197
},
{
"epoch": 0.3010617578689452,
"grad_norm": 0.5033738613128662,
"learning_rate": 8.040502141124178e-05,
"loss": 2.0936,
"step": 1198
},
{
"epoch": 0.3013130615065653,
"grad_norm": 0.36377644538879395,
"learning_rate": 8.037327352902598e-05,
"loss": 1.9558,
"step": 1199
},
{
"epoch": 0.30156436514418544,
"grad_norm": 0.48326364159584045,
"learning_rate": 8.034150622779209e-05,
"loss": 2.2018,
"step": 1200
},
{
"epoch": 0.3018156687818056,
"grad_norm": 0.5234176516532898,
"learning_rate": 8.030971952785041e-05,
"loss": 2.0583,
"step": 1201
},
{
"epoch": 0.30206697241942576,
"grad_norm": 0.4377163052558899,
"learning_rate": 8.027791344952363e-05,
"loss": 2.1404,
"step": 1202
},
{
"epoch": 0.3023182760570459,
"grad_norm": 0.25891783833503723,
"learning_rate": 8.024608801314684e-05,
"loss": 1.7364,
"step": 1203
},
{
"epoch": 0.3025695796946661,
"grad_norm": 0.508296549320221,
"learning_rate": 8.021424323906745e-05,
"loss": 2.5818,
"step": 1204
},
{
"epoch": 0.30282088333228624,
"grad_norm": 0.4298109710216522,
"learning_rate": 8.018237914764533e-05,
"loss": 2.0584,
"step": 1205
},
{
"epoch": 0.3030721869699064,
"grad_norm": 0.40173277258872986,
"learning_rate": 8.01504957592526e-05,
"loss": 1.7585,
"step": 1206
},
{
"epoch": 0.30332349060752656,
"grad_norm": 0.5122236013412476,
"learning_rate": 8.011859309427379e-05,
"loss": 1.9794,
"step": 1207
},
{
"epoch": 0.3035747942451467,
"grad_norm": 0.624552309513092,
"learning_rate": 8.008667117310572e-05,
"loss": 1.8687,
"step": 1208
},
{
"epoch": 0.3038260978827669,
"grad_norm": 0.4688674807548523,
"learning_rate": 8.005473001615753e-05,
"loss": 2.1042,
"step": 1209
},
{
"epoch": 0.304077401520387,
"grad_norm": 0.4452720284461975,
"learning_rate": 8.002276964385065e-05,
"loss": 2.0988,
"step": 1210
},
{
"epoch": 0.30432870515800714,
"grad_norm": 0.4090367257595062,
"learning_rate": 7.99907900766188e-05,
"loss": 2.4699,
"step": 1211
},
{
"epoch": 0.3045800087956273,
"grad_norm": 0.5209043025970459,
"learning_rate": 7.9958791334908e-05,
"loss": 2.0238,
"step": 1212
},
{
"epoch": 0.30483131243324746,
"grad_norm": 0.2394983172416687,
"learning_rate": 7.992677343917648e-05,
"loss": 1.0293,
"step": 1213
},
{
"epoch": 0.3050826160708676,
"grad_norm": 0.38234421610832214,
"learning_rate": 7.989473640989475e-05,
"loss": 2.5856,
"step": 1214
},
{
"epoch": 0.3053339197084878,
"grad_norm": 0.3305385112762451,
"learning_rate": 7.986268026754554e-05,
"loss": 2.4238,
"step": 1215
},
{
"epoch": 0.30558522334610794,
"grad_norm": 0.3514966368675232,
"learning_rate": 7.98306050326238e-05,
"loss": 2.1778,
"step": 1216
},
{
"epoch": 0.3058365269837281,
"grad_norm": 0.3517964780330658,
"learning_rate": 7.979851072563669e-05,
"loss": 1.7162,
"step": 1217
},
{
"epoch": 0.30608783062134826,
"grad_norm": 0.3453415036201477,
"learning_rate": 7.976639736710357e-05,
"loss": 2.5491,
"step": 1218
},
{
"epoch": 0.3063391342589684,
"grad_norm": 0.4221291244029999,
"learning_rate": 7.973426497755599e-05,
"loss": 2.4673,
"step": 1219
},
{
"epoch": 0.3065904378965886,
"grad_norm": 0.4376787543296814,
"learning_rate": 7.970211357753761e-05,
"loss": 2.6587,
"step": 1220
},
{
"epoch": 0.30684174153420873,
"grad_norm": 0.31638938188552856,
"learning_rate": 7.96699431876043e-05,
"loss": 2.3988,
"step": 1221
},
{
"epoch": 0.30709304517182884,
"grad_norm": 0.4708680510520935,
"learning_rate": 7.963775382832407e-05,
"loss": 2.3329,
"step": 1222
},
{
"epoch": 0.307344348809449,
"grad_norm": 0.3276142179965973,
"learning_rate": 7.960554552027703e-05,
"loss": 2.2746,
"step": 1223
},
{
"epoch": 0.30759565244706916,
"grad_norm": 0.23761656880378723,
"learning_rate": 7.957331828405543e-05,
"loss": 2.0233,
"step": 1224
},
{
"epoch": 0.3078469560846893,
"grad_norm": 0.4292398691177368,
"learning_rate": 7.95410721402636e-05,
"loss": 1.866,
"step": 1225
},
{
"epoch": 0.3080982597223095,
"grad_norm": 0.24796533584594727,
"learning_rate": 7.9508807109518e-05,
"loss": 2.4157,
"step": 1226
},
{
"epoch": 0.30834956335992963,
"grad_norm": 0.4541628360748291,
"learning_rate": 7.947652321244709e-05,
"loss": 2.5871,
"step": 1227
},
{
"epoch": 0.3086008669975498,
"grad_norm": 3.6204922199249268,
"learning_rate": 7.944422046969146e-05,
"loss": 2.5856,
"step": 1228
},
{
"epoch": 0.30885217063516995,
"grad_norm": 0.1898091733455658,
"learning_rate": 7.941189890190375e-05,
"loss": 1.8819,
"step": 1229
},
{
"epoch": 0.3091034742727901,
"grad_norm": 0.4697364866733551,
"learning_rate": 7.937955852974856e-05,
"loss": 2.7554,
"step": 1230
},
{
"epoch": 0.3093547779104103,
"grad_norm": 0.2671608030796051,
"learning_rate": 7.934719937390258e-05,
"loss": 2.0734,
"step": 1231
},
{
"epoch": 0.30960608154803043,
"grad_norm": 0.3299066424369812,
"learning_rate": 7.931482145505452e-05,
"loss": 2.3875,
"step": 1232
},
{
"epoch": 0.30985738518565054,
"grad_norm": 0.45493796467781067,
"learning_rate": 7.928242479390504e-05,
"loss": 2.1111,
"step": 1233
},
{
"epoch": 0.3101086888232707,
"grad_norm": 0.4255753755569458,
"learning_rate": 7.925000941116681e-05,
"loss": 1.9511,
"step": 1234
},
{
"epoch": 0.31035999246089085,
"grad_norm": 0.4670938551425934,
"learning_rate": 7.921757532756444e-05,
"loss": 1.9518,
"step": 1235
},
{
"epoch": 0.310611296098511,
"grad_norm": 0.3174899220466614,
"learning_rate": 7.918512256383452e-05,
"loss": 1.8118,
"step": 1236
},
{
"epoch": 0.3108625997361312,
"grad_norm": 0.35116392374038696,
"learning_rate": 7.915265114072562e-05,
"loss": 2.0331,
"step": 1237
},
{
"epoch": 0.31111390337375133,
"grad_norm": 0.30320197343826294,
"learning_rate": 7.912016107899818e-05,
"loss": 2.3592,
"step": 1238
},
{
"epoch": 0.3113652070113715,
"grad_norm": 0.3480928838253021,
"learning_rate": 7.908765239942455e-05,
"loss": 2.5963,
"step": 1239
},
{
"epoch": 0.31161651064899165,
"grad_norm": 0.3407917618751526,
"learning_rate": 7.905512512278904e-05,
"loss": 2.2058,
"step": 1240
},
{
"epoch": 0.3118678142866118,
"grad_norm": 0.3371085226535797,
"learning_rate": 7.902257926988781e-05,
"loss": 2.2821,
"step": 1241
},
{
"epoch": 0.31211911792423197,
"grad_norm": 0.5407741069793701,
"learning_rate": 7.89900148615289e-05,
"loss": 2.5554,
"step": 1242
},
{
"epoch": 0.31237042156185213,
"grad_norm": 0.5080079436302185,
"learning_rate": 7.895743191853224e-05,
"loss": 2.5177,
"step": 1243
},
{
"epoch": 0.3126217251994723,
"grad_norm": 0.4085712432861328,
"learning_rate": 7.892483046172958e-05,
"loss": 2.4434,
"step": 1244
},
{
"epoch": 0.3128730288370924,
"grad_norm": 0.727549135684967,
"learning_rate": 7.889221051196451e-05,
"loss": 2.0379,
"step": 1245
},
{
"epoch": 0.31312433247471255,
"grad_norm": 0.4746238887310028,
"learning_rate": 7.885957209009247e-05,
"loss": 2.1352,
"step": 1246
},
{
"epoch": 0.3133756361123327,
"grad_norm": 0.370112806558609,
"learning_rate": 7.882691521698069e-05,
"loss": 2.5383,
"step": 1247
},
{
"epoch": 0.31362693974995287,
"grad_norm": 0.4031289219856262,
"learning_rate": 7.879423991350817e-05,
"loss": 1.896,
"step": 1248
},
{
"epoch": 0.31387824338757303,
"grad_norm": 0.6527341604232788,
"learning_rate": 7.876154620056573e-05,
"loss": 2.2678,
"step": 1249
},
{
"epoch": 0.3141295470251932,
"grad_norm": 0.3396470546722412,
"learning_rate": 7.872883409905599e-05,
"loss": 2.593,
"step": 1250
},
{
"epoch": 0.31438085066281335,
"grad_norm": 0.4700802266597748,
"learning_rate": 7.869610362989322e-05,
"loss": 2.459,
"step": 1251
},
{
"epoch": 0.3146321543004335,
"grad_norm": 0.4480900466442108,
"learning_rate": 7.866335481400355e-05,
"loss": 1.8601,
"step": 1252
},
{
"epoch": 0.31488345793805367,
"grad_norm": 0.43752118945121765,
"learning_rate": 7.863058767232477e-05,
"loss": 2.5687,
"step": 1253
},
{
"epoch": 0.3151347615756738,
"grad_norm": 0.4294682741165161,
"learning_rate": 7.859780222580642e-05,
"loss": 1.6714,
"step": 1254
},
{
"epoch": 0.315386065213294,
"grad_norm": 0.4277234673500061,
"learning_rate": 7.856499849540973e-05,
"loss": 1.7993,
"step": 1255
},
{
"epoch": 0.3156373688509141,
"grad_norm": 0.2997349202632904,
"learning_rate": 7.85321765021076e-05,
"loss": 2.388,
"step": 1256
},
{
"epoch": 0.31588867248853425,
"grad_norm": 0.39641648530960083,
"learning_rate": 7.849933626688464e-05,
"loss": 2.3478,
"step": 1257
},
{
"epoch": 0.3161399761261544,
"grad_norm": 0.42108651995658875,
"learning_rate": 7.846647781073712e-05,
"loss": 2.4717,
"step": 1258
},
{
"epoch": 0.31639127976377457,
"grad_norm": 0.3705635964870453,
"learning_rate": 7.843360115467293e-05,
"loss": 2.0972,
"step": 1259
},
{
"epoch": 0.31664258340139473,
"grad_norm": 0.6227510571479797,
"learning_rate": 7.840070631971163e-05,
"loss": 1.7682,
"step": 1260
},
{
"epoch": 0.3168938870390149,
"grad_norm": 0.35013604164123535,
"learning_rate": 7.836779332688436e-05,
"loss": 2.371,
"step": 1261
},
{
"epoch": 0.31714519067663505,
"grad_norm": 0.6708528995513916,
"learning_rate": 7.833486219723394e-05,
"loss": 2.0226,
"step": 1262
},
{
"epoch": 0.3173964943142552,
"grad_norm": 0.3249523639678955,
"learning_rate": 7.83019129518147e-05,
"loss": 1.9312,
"step": 1263
},
{
"epoch": 0.31764779795187537,
"grad_norm": 0.3649659752845764,
"learning_rate": 7.826894561169262e-05,
"loss": 2.6408,
"step": 1264
},
{
"epoch": 0.3178991015894955,
"grad_norm": 0.5461932420730591,
"learning_rate": 7.823596019794521e-05,
"loss": 2.23,
"step": 1265
},
{
"epoch": 0.3181504052271157,
"grad_norm": 0.48531582951545715,
"learning_rate": 7.820295673166155e-05,
"loss": 2.0363,
"step": 1266
},
{
"epoch": 0.31840170886473584,
"grad_norm": 0.3750603199005127,
"learning_rate": 7.81699352339423e-05,
"loss": 2.4194,
"step": 1267
},
{
"epoch": 0.31865301250235595,
"grad_norm": 0.6479251384735107,
"learning_rate": 7.813689572589952e-05,
"loss": 2.3267,
"step": 1268
},
{
"epoch": 0.3189043161399761,
"grad_norm": 0.3576231896877289,
"learning_rate": 7.810383822865697e-05,
"loss": 2.2102,
"step": 1269
},
{
"epoch": 0.31915561977759627,
"grad_norm": 0.3646045923233032,
"learning_rate": 7.807076276334973e-05,
"loss": 1.9242,
"step": 1270
},
{
"epoch": 0.3194069234152164,
"grad_norm": 0.6005945205688477,
"learning_rate": 7.803766935112452e-05,
"loss": 1.7235,
"step": 1271
},
{
"epoch": 0.3196582270528366,
"grad_norm": 0.4535173773765564,
"learning_rate": 7.800455801313943e-05,
"loss": 1.8387,
"step": 1272
},
{
"epoch": 0.31990953069045674,
"grad_norm": 0.30238354206085205,
"learning_rate": 7.797142877056406e-05,
"loss": 1.626,
"step": 1273
},
{
"epoch": 0.3201608343280769,
"grad_norm": 0.4098001718521118,
"learning_rate": 7.793828164457944e-05,
"loss": 1.9611,
"step": 1274
},
{
"epoch": 0.32041213796569706,
"grad_norm": 0.3639359772205353,
"learning_rate": 7.790511665637803e-05,
"loss": 2.1905,
"step": 1275
},
{
"epoch": 0.3206634416033172,
"grad_norm": 0.3771366477012634,
"learning_rate": 7.787193382716374e-05,
"loss": 2.0498,
"step": 1276
},
{
"epoch": 0.3209147452409374,
"grad_norm": 0.22171245515346527,
"learning_rate": 7.783873317815184e-05,
"loss": 1.7027,
"step": 1277
},
{
"epoch": 0.32116604887855754,
"grad_norm": 0.5142925381660461,
"learning_rate": 7.780551473056904e-05,
"loss": 2.4564,
"step": 1278
},
{
"epoch": 0.32141735251617765,
"grad_norm": 0.3985360264778137,
"learning_rate": 7.777227850565341e-05,
"loss": 2.3719,
"step": 1279
},
{
"epoch": 0.3216686561537978,
"grad_norm": 0.48328259587287903,
"learning_rate": 7.773902452465436e-05,
"loss": 2.405,
"step": 1280
},
{
"epoch": 0.32191995979141796,
"grad_norm": 0.455669641494751,
"learning_rate": 7.77057528088327e-05,
"loss": 2.2332,
"step": 1281
},
{
"epoch": 0.3221712634290381,
"grad_norm": 0.2635050415992737,
"learning_rate": 7.767246337946053e-05,
"loss": 2.0382,
"step": 1282
},
{
"epoch": 0.3224225670666583,
"grad_norm": 0.46467408537864685,
"learning_rate": 7.763915625782132e-05,
"loss": 2.1012,
"step": 1283
},
{
"epoch": 0.32267387070427844,
"grad_norm": 0.44746190309524536,
"learning_rate": 7.76058314652098e-05,
"loss": 1.9779,
"step": 1284
},
{
"epoch": 0.3229251743418986,
"grad_norm": 0.4189201295375824,
"learning_rate": 7.757248902293209e-05,
"loss": 2.7604,
"step": 1285
},
{
"epoch": 0.32317647797951876,
"grad_norm": 0.35619938373565674,
"learning_rate": 7.753912895230546e-05,
"loss": 1.6188,
"step": 1286
},
{
"epoch": 0.3234277816171389,
"grad_norm": 0.31852611899375916,
"learning_rate": 7.750575127465858e-05,
"loss": 2.2916,
"step": 1287
},
{
"epoch": 0.3236790852547591,
"grad_norm": 0.427613765001297,
"learning_rate": 7.747235601133129e-05,
"loss": 1.7733,
"step": 1288
},
{
"epoch": 0.32393038889237924,
"grad_norm": 0.4661354720592499,
"learning_rate": 7.74389431836747e-05,
"loss": 2.4934,
"step": 1289
},
{
"epoch": 0.3241816925299994,
"grad_norm": 0.2827777862548828,
"learning_rate": 7.740551281305119e-05,
"loss": 1.927,
"step": 1290
},
{
"epoch": 0.3244329961676195,
"grad_norm": 0.37848183512687683,
"learning_rate": 7.737206492083428e-05,
"loss": 2.2857,
"step": 1291
},
{
"epoch": 0.32468429980523966,
"grad_norm": 0.4958799183368683,
"learning_rate": 7.733859952840875e-05,
"loss": 2.1101,
"step": 1292
},
{
"epoch": 0.3249356034428598,
"grad_norm": 0.27771416306495667,
"learning_rate": 7.730511665717056e-05,
"loss": 2.6824,
"step": 1293
},
{
"epoch": 0.32518690708048,
"grad_norm": 0.27138715982437134,
"learning_rate": 7.72716163285268e-05,
"loss": 1.9027,
"step": 1294
},
{
"epoch": 0.32543821071810014,
"grad_norm": 0.40234288573265076,
"learning_rate": 7.723809856389578e-05,
"loss": 1.9377,
"step": 1295
},
{
"epoch": 0.3256895143557203,
"grad_norm": 0.45054858922958374,
"learning_rate": 7.720456338470696e-05,
"loss": 2.0891,
"step": 1296
},
{
"epoch": 0.32594081799334046,
"grad_norm": 0.28506696224212646,
"learning_rate": 7.717101081240087e-05,
"loss": 2.0881,
"step": 1297
},
{
"epoch": 0.3261921216309606,
"grad_norm": 0.621242880821228,
"learning_rate": 7.713744086842921e-05,
"loss": 2.3001,
"step": 1298
},
{
"epoch": 0.3264434252685808,
"grad_norm": 0.3025634288787842,
"learning_rate": 7.710385357425479e-05,
"loss": 2.4073,
"step": 1299
},
{
"epoch": 0.32669472890620094,
"grad_norm": 0.4620816707611084,
"learning_rate": 7.707024895135147e-05,
"loss": 2.1442,
"step": 1300
},
{
"epoch": 0.3269460325438211,
"grad_norm": 0.47035887837409973,
"learning_rate": 7.703662702120424e-05,
"loss": 2.2738,
"step": 1301
},
{
"epoch": 0.3271973361814412,
"grad_norm": 0.4486905038356781,
"learning_rate": 7.700298780530913e-05,
"loss": 2.5139,
"step": 1302
},
{
"epoch": 0.32744863981906136,
"grad_norm": 0.459060400724411,
"learning_rate": 7.69693313251732e-05,
"loss": 2.2898,
"step": 1303
},
{
"epoch": 0.3276999434566815,
"grad_norm": 0.3948078155517578,
"learning_rate": 7.693565760231461e-05,
"loss": 2.0639,
"step": 1304
},
{
"epoch": 0.3279512470943017,
"grad_norm": 0.45571649074554443,
"learning_rate": 7.690196665826247e-05,
"loss": 2.3439,
"step": 1305
},
{
"epoch": 0.32820255073192184,
"grad_norm": 0.45678454637527466,
"learning_rate": 7.686825851455695e-05,
"loss": 2.479,
"step": 1306
},
{
"epoch": 0.328453854369542,
"grad_norm": 0.3192417025566101,
"learning_rate": 7.683453319274922e-05,
"loss": 2.2833,
"step": 1307
},
{
"epoch": 0.32870515800716216,
"grad_norm": 0.3500188887119293,
"learning_rate": 7.68007907144014e-05,
"loss": 2.1342,
"step": 1308
},
{
"epoch": 0.3289564616447823,
"grad_norm": 0.38774073123931885,
"learning_rate": 7.676703110108658e-05,
"loss": 2.2442,
"step": 1309
},
{
"epoch": 0.3292077652824025,
"grad_norm": 0.4461459219455719,
"learning_rate": 7.673325437438885e-05,
"loss": 2.4629,
"step": 1310
},
{
"epoch": 0.32945906892002264,
"grad_norm": 0.42307528853416443,
"learning_rate": 7.669946055590318e-05,
"loss": 2.6043,
"step": 1311
},
{
"epoch": 0.3297103725576428,
"grad_norm": 0.3208043575286865,
"learning_rate": 7.666564966723552e-05,
"loss": 2.4378,
"step": 1312
},
{
"epoch": 0.32996167619526295,
"grad_norm": 0.42787060141563416,
"learning_rate": 7.663182173000269e-05,
"loss": 2.2689,
"step": 1313
},
{
"epoch": 0.33021297983288306,
"grad_norm": 0.42350679636001587,
"learning_rate": 7.659797676583247e-05,
"loss": 2.142,
"step": 1314
},
{
"epoch": 0.3304642834705032,
"grad_norm": 0.3012787699699402,
"learning_rate": 7.656411479636345e-05,
"loss": 2.1637,
"step": 1315
},
{
"epoch": 0.3307155871081234,
"grad_norm": 0.4277016222476959,
"learning_rate": 7.653023584324516e-05,
"loss": 2.0439,
"step": 1316
},
{
"epoch": 0.33096689074574354,
"grad_norm": 0.1811779886484146,
"learning_rate": 7.649633992813793e-05,
"loss": 0.9751,
"step": 1317
},
{
"epoch": 0.3312181943833637,
"grad_norm": 0.4230121672153473,
"learning_rate": 7.646242707271298e-05,
"loss": 2.6928,
"step": 1318
},
{
"epoch": 0.33146949802098385,
"grad_norm": 0.3608384132385254,
"learning_rate": 7.642849729865235e-05,
"loss": 2.3808,
"step": 1319
},
{
"epoch": 0.331720801658604,
"grad_norm": 0.4780106842517853,
"learning_rate": 7.639455062764888e-05,
"loss": 1.6927,
"step": 1320
},
{
"epoch": 0.3319721052962242,
"grad_norm": 0.3628620505332947,
"learning_rate": 7.636058708140622e-05,
"loss": 2.1814,
"step": 1321
},
{
"epoch": 0.33222340893384433,
"grad_norm": 0.28604358434677124,
"learning_rate": 7.632660668163882e-05,
"loss": 2.0401,
"step": 1322
},
{
"epoch": 0.3324747125714645,
"grad_norm": 0.29259243607521057,
"learning_rate": 7.62926094500719e-05,
"loss": 2.0084,
"step": 1323
},
{
"epoch": 0.33272601620908465,
"grad_norm": 0.38667431473731995,
"learning_rate": 7.625859540844144e-05,
"loss": 2.2576,
"step": 1324
},
{
"epoch": 0.33297731984670476,
"grad_norm": 0.5796361565589905,
"learning_rate": 7.622456457849417e-05,
"loss": 2.3307,
"step": 1325
},
{
"epoch": 0.3332286234843249,
"grad_norm": 0.37588635087013245,
"learning_rate": 7.619051698198752e-05,
"loss": 2.2745,
"step": 1326
},
{
"epoch": 0.3334799271219451,
"grad_norm": 0.15837115049362183,
"learning_rate": 7.615645264068971e-05,
"loss": 0.8862,
"step": 1327
},
{
"epoch": 0.33373123075956523,
"grad_norm": 0.6717337965965271,
"learning_rate": 7.612237157637962e-05,
"loss": 2.4251,
"step": 1328
},
{
"epoch": 0.3339825343971854,
"grad_norm": 0.2916143834590912,
"learning_rate": 7.608827381084682e-05,
"loss": 2.4426,
"step": 1329
},
{
"epoch": 0.33423383803480555,
"grad_norm": 0.4805827736854553,
"learning_rate": 7.605415936589155e-05,
"loss": 1.997,
"step": 1330
},
{
"epoch": 0.3344851416724257,
"grad_norm": 0.3782505989074707,
"learning_rate": 7.602002826332475e-05,
"loss": 1.9675,
"step": 1331
},
{
"epoch": 0.33473644531004587,
"grad_norm": 0.38300177454948425,
"learning_rate": 7.598588052496799e-05,
"loss": 2.6077,
"step": 1332
},
{
"epoch": 0.33498774894766603,
"grad_norm": 0.845617949962616,
"learning_rate": 7.595171617265347e-05,
"loss": 2.7052,
"step": 1333
},
{
"epoch": 0.3352390525852862,
"grad_norm": 0.4692726135253906,
"learning_rate": 7.5917535228224e-05,
"loss": 1.9976,
"step": 1334
},
{
"epoch": 0.33549035622290635,
"grad_norm": 0.42371514439582825,
"learning_rate": 7.588333771353305e-05,
"loss": 1.9521,
"step": 1335
},
{
"epoch": 0.33574165986052645,
"grad_norm": 0.34298497438430786,
"learning_rate": 7.584912365044463e-05,
"loss": 2.3475,
"step": 1336
},
{
"epoch": 0.3359929634981466,
"grad_norm": 0.4288291931152344,
"learning_rate": 7.581489306083334e-05,
"loss": 2.4001,
"step": 1337
},
{
"epoch": 0.3362442671357668,
"grad_norm": 0.371548056602478,
"learning_rate": 7.578064596658438e-05,
"loss": 2.0676,
"step": 1338
},
{
"epoch": 0.33649557077338693,
"grad_norm": 0.18411099910736084,
"learning_rate": 7.574638238959345e-05,
"loss": 1.4833,
"step": 1339
},
{
"epoch": 0.3367468744110071,
"grad_norm": 0.4732882082462311,
"learning_rate": 7.571210235176684e-05,
"loss": 2.3069,
"step": 1340
},
{
"epoch": 0.33699817804862725,
"grad_norm": 0.38196468353271484,
"learning_rate": 7.567780587502134e-05,
"loss": 2.331,
"step": 1341
},
{
"epoch": 0.3372494816862474,
"grad_norm": 0.41077175736427307,
"learning_rate": 7.564349298128423e-05,
"loss": 2.0708,
"step": 1342
},
{
"epoch": 0.33750078532386757,
"grad_norm": 0.21946145594120026,
"learning_rate": 7.560916369249333e-05,
"loss": 1.8817,
"step": 1343
},
{
"epoch": 0.33775208896148773,
"grad_norm": 0.2860282361507416,
"learning_rate": 7.557481803059692e-05,
"loss": 2.4348,
"step": 1344
},
{
"epoch": 0.3380033925991079,
"grad_norm": 0.8802721500396729,
"learning_rate": 7.554045601755371e-05,
"loss": 2.3791,
"step": 1345
},
{
"epoch": 0.33825469623672805,
"grad_norm": 0.3701392412185669,
"learning_rate": 7.550607767533294e-05,
"loss": 2.6149,
"step": 1346
},
{
"epoch": 0.3385059998743482,
"grad_norm": 0.3857259154319763,
"learning_rate": 7.547168302591424e-05,
"loss": 2.5165,
"step": 1347
},
{
"epoch": 0.3387573035119683,
"grad_norm": 0.3372226357460022,
"learning_rate": 7.543727209128768e-05,
"loss": 1.8363,
"step": 1348
},
{
"epoch": 0.33900860714958847,
"grad_norm": 0.27592962980270386,
"learning_rate": 7.540284489345372e-05,
"loss": 2.3205,
"step": 1349
},
{
"epoch": 0.33925991078720863,
"grad_norm": 0.3359019160270691,
"learning_rate": 7.536840145442327e-05,
"loss": 1.7028,
"step": 1350
},
{
"epoch": 0.3395112144248288,
"grad_norm": 0.21417422592639923,
"learning_rate": 7.533394179621756e-05,
"loss": 1.8902,
"step": 1351
},
{
"epoch": 0.33976251806244895,
"grad_norm": 0.37822896242141724,
"learning_rate": 7.529946594086823e-05,
"loss": 2.5567,
"step": 1352
},
{
"epoch": 0.3400138217000691,
"grad_norm": 0.3702054023742676,
"learning_rate": 7.526497391041727e-05,
"loss": 1.854,
"step": 1353
},
{
"epoch": 0.34026512533768927,
"grad_norm": 0.2901824414730072,
"learning_rate": 7.523046572691699e-05,
"loss": 2.1113,
"step": 1354
},
{
"epoch": 0.3405164289753094,
"grad_norm": 0.4038754403591156,
"learning_rate": 7.519594141243007e-05,
"loss": 2.4576,
"step": 1355
},
{
"epoch": 0.3407677326129296,
"grad_norm": 0.5765335559844971,
"learning_rate": 7.516140098902946e-05,
"loss": 2.6652,
"step": 1356
},
{
"epoch": 0.34101903625054975,
"grad_norm": 0.3066706657409668,
"learning_rate": 7.512684447879845e-05,
"loss": 2.2157,
"step": 1357
},
{
"epoch": 0.3412703398881699,
"grad_norm": 0.4212114214897156,
"learning_rate": 7.509227190383057e-05,
"loss": 2.747,
"step": 1358
},
{
"epoch": 0.34152164352579,
"grad_norm": 0.5156689882278442,
"learning_rate": 7.505768328622964e-05,
"loss": 2.0679,
"step": 1359
},
{
"epoch": 0.34177294716341017,
"grad_norm": 0.43117931485176086,
"learning_rate": 7.502307864810973e-05,
"loss": 2.1347,
"step": 1360
},
{
"epoch": 0.3420242508010303,
"grad_norm": 0.5635091662406921,
"learning_rate": 7.498845801159521e-05,
"loss": 2.6811,
"step": 1361
},
{
"epoch": 0.3422755544386505,
"grad_norm": 0.3573296070098877,
"learning_rate": 7.49538213988206e-05,
"loss": 2.2536,
"step": 1362
},
{
"epoch": 0.34252685807627065,
"grad_norm": 0.4179854094982147,
"learning_rate": 7.491916883193065e-05,
"loss": 1.9621,
"step": 1363
},
{
"epoch": 0.3427781617138908,
"grad_norm": 0.2795206308364868,
"learning_rate": 7.488450033308034e-05,
"loss": 1.8347,
"step": 1364
},
{
"epoch": 0.34302946535151096,
"grad_norm": 0.48083072900772095,
"learning_rate": 7.484981592443483e-05,
"loss": 1.8598,
"step": 1365
},
{
"epoch": 0.3432807689891311,
"grad_norm": 0.4337339401245117,
"learning_rate": 7.481511562816943e-05,
"loss": 2.1042,
"step": 1366
},
{
"epoch": 0.3435320726267513,
"grad_norm": 0.3307173550128937,
"learning_rate": 7.478039946646963e-05,
"loss": 2.6296,
"step": 1367
},
{
"epoch": 0.34378337626437144,
"grad_norm": 0.39121687412261963,
"learning_rate": 7.474566746153105e-05,
"loss": 2.0115,
"step": 1368
},
{
"epoch": 0.3440346799019916,
"grad_norm": 0.3068203628063202,
"learning_rate": 7.471091963555946e-05,
"loss": 2.3176,
"step": 1369
},
{
"epoch": 0.34428598353961176,
"grad_norm": 0.42959433794021606,
"learning_rate": 7.467615601077071e-05,
"loss": 2.2989,
"step": 1370
},
{
"epoch": 0.34453728717723187,
"grad_norm": 0.45218074321746826,
"learning_rate": 7.464137660939076e-05,
"loss": 2.1487,
"step": 1371
},
{
"epoch": 0.344788590814852,
"grad_norm": 0.3469322919845581,
"learning_rate": 7.460658145365566e-05,
"loss": 2.48,
"step": 1372
},
{
"epoch": 0.3450398944524722,
"grad_norm": 0.28552716970443726,
"learning_rate": 7.45717705658116e-05,
"loss": 2.0998,
"step": 1373
},
{
"epoch": 0.34529119809009234,
"grad_norm": 0.5254274010658264,
"learning_rate": 7.453694396811469e-05,
"loss": 1.8048,
"step": 1374
},
{
"epoch": 0.3455425017277125,
"grad_norm": 0.41990652680397034,
"learning_rate": 7.450210168283122e-05,
"loss": 1.6255,
"step": 1375
},
{
"epoch": 0.34579380536533266,
"grad_norm": 0.353322833776474,
"learning_rate": 7.446724373223743e-05,
"loss": 1.9041,
"step": 1376
},
{
"epoch": 0.3460451090029528,
"grad_norm": 0.347417414188385,
"learning_rate": 7.443237013861958e-05,
"loss": 2.1782,
"step": 1377
},
{
"epoch": 0.346296412640573,
"grad_norm": 0.2713114619255066,
"learning_rate": 7.439748092427397e-05,
"loss": 2.09,
"step": 1378
},
{
"epoch": 0.34654771627819314,
"grad_norm": 0.5424685478210449,
"learning_rate": 7.436257611150685e-05,
"loss": 2.2003,
"step": 1379
},
{
"epoch": 0.3467990199158133,
"grad_norm": 0.17719517648220062,
"learning_rate": 7.432765572263447e-05,
"loss": 1.0122,
"step": 1380
},
{
"epoch": 0.34705032355343346,
"grad_norm": 0.33825281262397766,
"learning_rate": 7.429271977998303e-05,
"loss": 2.3135,
"step": 1381
},
{
"epoch": 0.34730162719105356,
"grad_norm": 0.5347048044204712,
"learning_rate": 7.425776830588864e-05,
"loss": 2.0274,
"step": 1382
},
{
"epoch": 0.3475529308286737,
"grad_norm": 0.5033213496208191,
"learning_rate": 7.422280132269741e-05,
"loss": 2.328,
"step": 1383
},
{
"epoch": 0.3478042344662939,
"grad_norm": 0.5650802850723267,
"learning_rate": 7.41878188527653e-05,
"loss": 2.6224,
"step": 1384
},
{
"epoch": 0.34805553810391404,
"grad_norm": 0.2062671184539795,
"learning_rate": 7.415282091845822e-05,
"loss": 0.8484,
"step": 1385
},
{
"epoch": 0.3483068417415342,
"grad_norm": 0.3416356146335602,
"learning_rate": 7.411780754215189e-05,
"loss": 2.2545,
"step": 1386
},
{
"epoch": 0.34855814537915436,
"grad_norm": 0.3872370719909668,
"learning_rate": 7.408277874623202e-05,
"loss": 2.5386,
"step": 1387
},
{
"epoch": 0.3488094490167745,
"grad_norm": 0.42746737599372864,
"learning_rate": 7.404773455309411e-05,
"loss": 2.4253,
"step": 1388
},
{
"epoch": 0.3490607526543947,
"grad_norm": 0.36025798320770264,
"learning_rate": 7.401267498514347e-05,
"loss": 2.7584,
"step": 1389
},
{
"epoch": 0.34931205629201484,
"grad_norm": 0.374305784702301,
"learning_rate": 7.397760006479531e-05,
"loss": 2.4407,
"step": 1390
},
{
"epoch": 0.349563359929635,
"grad_norm": 2.065537929534912,
"learning_rate": 7.394250981447462e-05,
"loss": 2.1375,
"step": 1391
},
{
"epoch": 0.34981466356725516,
"grad_norm": 0.2785825729370117,
"learning_rate": 7.390740425661619e-05,
"loss": 2.6172,
"step": 1392
},
{
"epoch": 0.3500659672048753,
"grad_norm": 0.45189452171325684,
"learning_rate": 7.387228341366461e-05,
"loss": 2.7076,
"step": 1393
},
{
"epoch": 0.3503172708424954,
"grad_norm": 0.4751068949699402,
"learning_rate": 7.383714730807423e-05,
"loss": 1.9378,
"step": 1394
},
{
"epoch": 0.3505685744801156,
"grad_norm": 0.4904008209705353,
"learning_rate": 7.380199596230916e-05,
"loss": 2.4684,
"step": 1395
},
{
"epoch": 0.35081987811773574,
"grad_norm": 0.36860087513923645,
"learning_rate": 7.376682939884328e-05,
"loss": 2.3451,
"step": 1396
},
{
"epoch": 0.3510711817553559,
"grad_norm": 0.4491627514362335,
"learning_rate": 7.373164764016014e-05,
"loss": 2.1021,
"step": 1397
},
{
"epoch": 0.35132248539297606,
"grad_norm": 0.5599209666252136,
"learning_rate": 7.369645070875307e-05,
"loss": 2.2324,
"step": 1398
},
{
"epoch": 0.3515737890305962,
"grad_norm": 0.3067469298839569,
"learning_rate": 7.366123862712506e-05,
"loss": 2.4792,
"step": 1399
},
{
"epoch": 0.3518250926682164,
"grad_norm": 0.41154590249061584,
"learning_rate": 7.362601141778881e-05,
"loss": 2.3062,
"step": 1400
},
{
"epoch": 0.35207639630583654,
"grad_norm": 0.46394750475883484,
"learning_rate": 7.359076910326667e-05,
"loss": 1.8677,
"step": 1401
},
{
"epoch": 0.3523276999434567,
"grad_norm": 0.3576370179653168,
"learning_rate": 7.355551170609066e-05,
"loss": 2.0856,
"step": 1402
},
{
"epoch": 0.35257900358107686,
"grad_norm": 0.3749783933162689,
"learning_rate": 7.352023924880245e-05,
"loss": 2.2229,
"step": 1403
},
{
"epoch": 0.352830307218697,
"grad_norm": 0.47458767890930176,
"learning_rate": 7.348495175395331e-05,
"loss": 2.0969,
"step": 1404
},
{
"epoch": 0.3530816108563171,
"grad_norm": 0.5009230375289917,
"learning_rate": 7.344964924410415e-05,
"loss": 2.104,
"step": 1405
},
{
"epoch": 0.3533329144939373,
"grad_norm": 0.3183805048465729,
"learning_rate": 7.341433174182551e-05,
"loss": 2.2145,
"step": 1406
},
{
"epoch": 0.35358421813155744,
"grad_norm": 0.30304932594299316,
"learning_rate": 7.337899926969741e-05,
"loss": 2.36,
"step": 1407
},
{
"epoch": 0.3538355217691776,
"grad_norm": 0.402339905500412,
"learning_rate": 7.334365185030958e-05,
"loss": 2.8444,
"step": 1408
},
{
"epoch": 0.35408682540679776,
"grad_norm": 0.3983165919780731,
"learning_rate": 7.330828950626118e-05,
"loss": 2.0626,
"step": 1409
},
{
"epoch": 0.3543381290444179,
"grad_norm": 0.36163392663002014,
"learning_rate": 7.3272912260161e-05,
"loss": 2.5673,
"step": 1410
},
{
"epoch": 0.3545894326820381,
"grad_norm": 0.28790605068206787,
"learning_rate": 7.323752013462731e-05,
"loss": 2.0637,
"step": 1411
},
{
"epoch": 0.35484073631965823,
"grad_norm": 0.3295561373233795,
"learning_rate": 7.320211315228794e-05,
"loss": 2.2175,
"step": 1412
},
{
"epoch": 0.3550920399572784,
"grad_norm": 0.2871148884296417,
"learning_rate": 7.316669133578014e-05,
"loss": 1.4072,
"step": 1413
},
{
"epoch": 0.35534334359489855,
"grad_norm": 0.4259054958820343,
"learning_rate": 7.313125470775072e-05,
"loss": 2.1157,
"step": 1414
},
{
"epoch": 0.3555946472325187,
"grad_norm": 0.3132629692554474,
"learning_rate": 7.309580329085593e-05,
"loss": 2.3555,
"step": 1415
},
{
"epoch": 0.35584595087013887,
"grad_norm": 0.5399345755577087,
"learning_rate": 7.306033710776147e-05,
"loss": 2.1951,
"step": 1416
},
{
"epoch": 0.356097254507759,
"grad_norm": 0.22800098359584808,
"learning_rate": 7.30248561811425e-05,
"loss": 1.8473,
"step": 1417
},
{
"epoch": 0.35634855814537914,
"grad_norm": 0.30099812150001526,
"learning_rate": 7.298936053368358e-05,
"loss": 2.2015,
"step": 1418
},
{
"epoch": 0.3565998617829993,
"grad_norm": 0.41763341426849365,
"learning_rate": 7.29538501880787e-05,
"loss": 1.8453,
"step": 1419
},
{
"epoch": 0.35685116542061945,
"grad_norm": 0.381164014339447,
"learning_rate": 7.291832516703125e-05,
"loss": 2.1346,
"step": 1420
},
{
"epoch": 0.3571024690582396,
"grad_norm": 0.44966503977775574,
"learning_rate": 7.288278549325398e-05,
"loss": 2.288,
"step": 1421
},
{
"epoch": 0.3573537726958598,
"grad_norm": 0.41181033849716187,
"learning_rate": 7.284723118946902e-05,
"loss": 2.0504,
"step": 1422
},
{
"epoch": 0.35760507633347993,
"grad_norm": 0.1352885365486145,
"learning_rate": 7.281166227840787e-05,
"loss": 0.9655,
"step": 1423
},
{
"epoch": 0.3578563799711001,
"grad_norm": 0.4056221544742584,
"learning_rate": 7.277607878281138e-05,
"loss": 2.4637,
"step": 1424
},
{
"epoch": 0.35810768360872025,
"grad_norm": 0.7632689476013184,
"learning_rate": 7.274048072542965e-05,
"loss": 2.2485,
"step": 1425
},
{
"epoch": 0.3583589872463404,
"grad_norm": 0.42599251866340637,
"learning_rate": 7.270486812902216e-05,
"loss": 2.4106,
"step": 1426
},
{
"epoch": 0.35861029088396057,
"grad_norm": 0.3249692916870117,
"learning_rate": 7.266924101635767e-05,
"loss": 2.173,
"step": 1427
},
{
"epoch": 0.3588615945215807,
"grad_norm": 0.3229914605617523,
"learning_rate": 7.263359941021423e-05,
"loss": 2.5058,
"step": 1428
},
{
"epoch": 0.35911289815920083,
"grad_norm": 0.5926707983016968,
"learning_rate": 7.259794333337914e-05,
"loss": 2.0133,
"step": 1429
},
{
"epoch": 0.359364201796821,
"grad_norm": 0.3878900408744812,
"learning_rate": 7.256227280864892e-05,
"loss": 1.943,
"step": 1430
},
{
"epoch": 0.35961550543444115,
"grad_norm": 0.34182676672935486,
"learning_rate": 7.252658785882942e-05,
"loss": 2.4174,
"step": 1431
},
{
"epoch": 0.3598668090720613,
"grad_norm": 0.3920558989048004,
"learning_rate": 7.24908885067356e-05,
"loss": 2.5771,
"step": 1432
},
{
"epoch": 0.36011811270968147,
"grad_norm": 0.26655280590057373,
"learning_rate": 7.24551747751917e-05,
"loss": 2.4015,
"step": 1433
},
{
"epoch": 0.36036941634730163,
"grad_norm": 0.6898893713951111,
"learning_rate": 7.241944668703114e-05,
"loss": 2.2238,
"step": 1434
},
{
"epoch": 0.3606207199849218,
"grad_norm": 0.2271176278591156,
"learning_rate": 7.238370426509653e-05,
"loss": 1.8,
"step": 1435
},
{
"epoch": 0.36087202362254195,
"grad_norm": 0.5071490406990051,
"learning_rate": 7.23479475322396e-05,
"loss": 2.2468,
"step": 1436
},
{
"epoch": 0.3611233272601621,
"grad_norm": 0.4647441804409027,
"learning_rate": 7.231217651132129e-05,
"loss": 2.2595,
"step": 1437
},
{
"epoch": 0.36137463089778227,
"grad_norm": 0.6888497471809387,
"learning_rate": 7.227639122521162e-05,
"loss": 1.8527,
"step": 1438
},
{
"epoch": 0.3616259345354024,
"grad_norm": 0.31782910227775574,
"learning_rate": 7.224059169678976e-05,
"loss": 2.5325,
"step": 1439
},
{
"epoch": 0.36187723817302253,
"grad_norm": 0.39101526141166687,
"learning_rate": 7.2204777948944e-05,
"loss": 2.0058,
"step": 1440
},
{
"epoch": 0.3621285418106427,
"grad_norm": 0.21498848497867584,
"learning_rate": 7.216895000457166e-05,
"loss": 2.092,
"step": 1441
},
{
"epoch": 0.36237984544826285,
"grad_norm": 0.3530738949775696,
"learning_rate": 7.213310788657922e-05,
"loss": 1.957,
"step": 1442
},
{
"epoch": 0.362631149085883,
"grad_norm": 0.2858533263206482,
"learning_rate": 7.209725161788217e-05,
"loss": 2.4624,
"step": 1443
},
{
"epoch": 0.36288245272350317,
"grad_norm": 0.512286365032196,
"learning_rate": 7.206138122140503e-05,
"loss": 2.2276,
"step": 1444
},
{
"epoch": 0.3631337563611233,
"grad_norm": 0.3804178833961487,
"learning_rate": 7.202549672008141e-05,
"loss": 2.1131,
"step": 1445
},
{
"epoch": 0.3633850599987435,
"grad_norm": 0.5225626230239868,
"learning_rate": 7.198959813685388e-05,
"loss": 2.1261,
"step": 1446
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.43813782930374146,
"learning_rate": 7.195368549467407e-05,
"loss": 1.9948,
"step": 1447
},
{
"epoch": 0.3638876672739838,
"grad_norm": 0.37537580728530884,
"learning_rate": 7.191775881650256e-05,
"loss": 2.3365,
"step": 1448
},
{
"epoch": 0.36413897091160397,
"grad_norm": 0.20629999041557312,
"learning_rate": 7.188181812530888e-05,
"loss": 1.0852,
"step": 1449
},
{
"epoch": 0.3643902745492241,
"grad_norm": 0.36247143149375916,
"learning_rate": 7.18458634440716e-05,
"loss": 2.2337,
"step": 1450
},
{
"epoch": 0.36464157818684423,
"grad_norm": 0.30473005771636963,
"learning_rate": 7.180989479577817e-05,
"loss": 1.8102,
"step": 1451
},
{
"epoch": 0.3648928818244644,
"grad_norm": 0.4480074942111969,
"learning_rate": 7.177391220342497e-05,
"loss": 2.5071,
"step": 1452
},
{
"epoch": 0.36514418546208455,
"grad_norm": 0.40253522992134094,
"learning_rate": 7.173791569001734e-05,
"loss": 2.6847,
"step": 1453
},
{
"epoch": 0.3653954890997047,
"grad_norm": 0.42434608936309814,
"learning_rate": 7.170190527856944e-05,
"loss": 2.1038,
"step": 1454
},
{
"epoch": 0.36564679273732487,
"grad_norm": 0.3165349066257477,
"learning_rate": 7.166588099210442e-05,
"loss": 2.7536,
"step": 1455
},
{
"epoch": 0.365898096374945,
"grad_norm": 0.36183032393455505,
"learning_rate": 7.162984285365424e-05,
"loss": 2.3763,
"step": 1456
},
{
"epoch": 0.3661494000125652,
"grad_norm": 0.17733515799045563,
"learning_rate": 7.15937908862597e-05,
"loss": 1.2717,
"step": 1457
},
{
"epoch": 0.36640070365018534,
"grad_norm": 0.49144411087036133,
"learning_rate": 7.155772511297051e-05,
"loss": 2.0034,
"step": 1458
},
{
"epoch": 0.3666520072878055,
"grad_norm": 0.31911566853523254,
"learning_rate": 7.15216455568451e-05,
"loss": 1.8858,
"step": 1459
},
{
"epoch": 0.36690331092542566,
"grad_norm": 0.35006940364837646,
"learning_rate": 7.148555224095083e-05,
"loss": 2.4235,
"step": 1460
},
{
"epoch": 0.3671546145630458,
"grad_norm": 0.21370449662208557,
"learning_rate": 7.144944518836377e-05,
"loss": 1.9213,
"step": 1461
},
{
"epoch": 0.367405918200666,
"grad_norm": 0.5415109395980835,
"learning_rate": 7.14133244221688e-05,
"loss": 2.3867,
"step": 1462
},
{
"epoch": 0.3676572218382861,
"grad_norm": 0.3929027318954468,
"learning_rate": 7.13771899654596e-05,
"loss": 2.3617,
"step": 1463
},
{
"epoch": 0.36790852547590625,
"grad_norm": 0.5242950320243835,
"learning_rate": 7.134104184133856e-05,
"loss": 2.0933,
"step": 1464
},
{
"epoch": 0.3681598291135264,
"grad_norm": 0.3808387517929077,
"learning_rate": 7.130488007291683e-05,
"loss": 1.905,
"step": 1465
},
{
"epoch": 0.36841113275114656,
"grad_norm": 0.33748942613601685,
"learning_rate": 7.126870468331426e-05,
"loss": 2.2044,
"step": 1466
},
{
"epoch": 0.3686624363887667,
"grad_norm": 0.37744027376174927,
"learning_rate": 7.123251569565943e-05,
"loss": 1.9647,
"step": 1467
},
{
"epoch": 0.3689137400263869,
"grad_norm": 0.43732622265815735,
"learning_rate": 7.119631313308964e-05,
"loss": 2.2956,
"step": 1468
},
{
"epoch": 0.36916504366400704,
"grad_norm": 0.2148832082748413,
"learning_rate": 7.116009701875078e-05,
"loss": 2.0276,
"step": 1469
},
{
"epoch": 0.3694163473016272,
"grad_norm": 0.3715471029281616,
"learning_rate": 7.112386737579752e-05,
"loss": 2.5056,
"step": 1470
},
{
"epoch": 0.36966765093924736,
"grad_norm": 0.34628719091415405,
"learning_rate": 7.108762422739309e-05,
"loss": 2.0201,
"step": 1471
},
{
"epoch": 0.3699189545768675,
"grad_norm": 0.40477246046066284,
"learning_rate": 7.10513675967094e-05,
"loss": 2.1746,
"step": 1472
},
{
"epoch": 0.3701702582144877,
"grad_norm": 0.46029016375541687,
"learning_rate": 7.101509750692695e-05,
"loss": 2.0847,
"step": 1473
},
{
"epoch": 0.3704215618521078,
"grad_norm": 0.45498889684677124,
"learning_rate": 7.097881398123488e-05,
"loss": 1.9687,
"step": 1474
},
{
"epoch": 0.37067286548972794,
"grad_norm": 0.46583226323127747,
"learning_rate": 7.094251704283089e-05,
"loss": 2.5303,
"step": 1475
},
{
"epoch": 0.3709241691273481,
"grad_norm": 0.2552844285964966,
"learning_rate": 7.090620671492128e-05,
"loss": 2.5519,
"step": 1476
},
{
"epoch": 0.37117547276496826,
"grad_norm": 0.14888420701026917,
"learning_rate": 7.086988302072089e-05,
"loss": 0.7764,
"step": 1477
},
{
"epoch": 0.3714267764025884,
"grad_norm": 0.3378463685512543,
"learning_rate": 7.08335459834531e-05,
"loss": 2.0823,
"step": 1478
},
{
"epoch": 0.3716780800402086,
"grad_norm": 0.46584033966064453,
"learning_rate": 7.079719562634986e-05,
"loss": 2.3412,
"step": 1479
},
{
"epoch": 0.37192938367782874,
"grad_norm": 0.34758704900741577,
"learning_rate": 7.076083197265157e-05,
"loss": 2.1768,
"step": 1480
},
{
"epoch": 0.3721806873154489,
"grad_norm": 0.40054845809936523,
"learning_rate": 7.072445504560722e-05,
"loss": 1.9939,
"step": 1481
},
{
"epoch": 0.37243199095306906,
"grad_norm": 0.29733559489250183,
"learning_rate": 7.068806486847421e-05,
"loss": 1.9872,
"step": 1482
},
{
"epoch": 0.3726832945906892,
"grad_norm": 0.3448224365711212,
"learning_rate": 7.065166146451844e-05,
"loss": 1.9621,
"step": 1483
},
{
"epoch": 0.3729345982283094,
"grad_norm": 0.3913850784301758,
"learning_rate": 7.061524485701428e-05,
"loss": 2.0991,
"step": 1484
},
{
"epoch": 0.37318590186592954,
"grad_norm": 0.34898319840431213,
"learning_rate": 7.057881506924448e-05,
"loss": 2.3239,
"step": 1485
},
{
"epoch": 0.37343720550354964,
"grad_norm": 0.40906834602355957,
"learning_rate": 7.054237212450034e-05,
"loss": 2.214,
"step": 1486
},
{
"epoch": 0.3736885091411698,
"grad_norm": 0.2804180979728699,
"learning_rate": 7.050591604608143e-05,
"loss": 2.3497,
"step": 1487
},
{
"epoch": 0.37393981277878996,
"grad_norm": 0.5161413550376892,
"learning_rate": 7.046944685729581e-05,
"loss": 2.0113,
"step": 1488
},
{
"epoch": 0.3741911164164101,
"grad_norm": 0.42594465613365173,
"learning_rate": 7.04329645814599e-05,
"loss": 2.2473,
"step": 1489
},
{
"epoch": 0.3744424200540303,
"grad_norm": 0.28369376063346863,
"learning_rate": 7.039646924189848e-05,
"loss": 2.5277,
"step": 1490
},
{
"epoch": 0.37469372369165044,
"grad_norm": 0.3760066330432892,
"learning_rate": 7.035996086194467e-05,
"loss": 2.5726,
"step": 1491
},
{
"epoch": 0.3749450273292706,
"grad_norm": 0.8721727132797241,
"learning_rate": 7.032343946493993e-05,
"loss": 2.1684,
"step": 1492
},
{
"epoch": 0.37519633096689076,
"grad_norm": 0.3655981123447418,
"learning_rate": 7.02869050742341e-05,
"loss": 2.2741,
"step": 1493
},
{
"epoch": 0.3754476346045109,
"grad_norm": 0.2971070408821106,
"learning_rate": 7.025035771318524e-05,
"loss": 2.0898,
"step": 1494
},
{
"epoch": 0.3756989382421311,
"grad_norm": 0.25091758370399475,
"learning_rate": 7.021379740515977e-05,
"loss": 1.8725,
"step": 1495
},
{
"epoch": 0.37595024187975123,
"grad_norm": 0.49658530950546265,
"learning_rate": 7.017722417353233e-05,
"loss": 2.4947,
"step": 1496
},
{
"epoch": 0.37620154551737134,
"grad_norm": 0.607513427734375,
"learning_rate": 7.014063804168587e-05,
"loss": 1.5828,
"step": 1497
},
{
"epoch": 0.3764528491549915,
"grad_norm": 0.22139927744865417,
"learning_rate": 7.010403903301158e-05,
"loss": 2.1792,
"step": 1498
},
{
"epoch": 0.37670415279261166,
"grad_norm": 0.670466959476471,
"learning_rate": 7.006742717090887e-05,
"loss": 2.6152,
"step": 1499
},
{
"epoch": 0.3769554564302318,
"grad_norm": 0.27696824073791504,
"learning_rate": 7.003080247878537e-05,
"loss": 2.2613,
"step": 1500
},
{
"epoch": 0.377206760067852,
"grad_norm": 0.5103288888931274,
"learning_rate": 6.99941649800569e-05,
"loss": 2.3938,
"step": 1501
},
{
"epoch": 0.37745806370547214,
"grad_norm": 0.38460054993629456,
"learning_rate": 6.995751469814751e-05,
"loss": 2.5157,
"step": 1502
},
{
"epoch": 0.3777093673430923,
"grad_norm": 0.46227124333381653,
"learning_rate": 6.992085165648939e-05,
"loss": 2.2615,
"step": 1503
},
{
"epoch": 0.37796067098071245,
"grad_norm": 0.4528365433216095,
"learning_rate": 6.988417587852287e-05,
"loss": 2.4324,
"step": 1504
},
{
"epoch": 0.3782119746183326,
"grad_norm": 0.172061488032341,
"learning_rate": 6.984748738769647e-05,
"loss": 1.8319,
"step": 1505
},
{
"epoch": 0.3784632782559528,
"grad_norm": 0.3058612048625946,
"learning_rate": 6.98107862074668e-05,
"loss": 2.1932,
"step": 1506
},
{
"epoch": 0.37871458189357293,
"grad_norm": 0.279033362865448,
"learning_rate": 6.977407236129862e-05,
"loss": 2.5174,
"step": 1507
},
{
"epoch": 0.37896588553119304,
"grad_norm": 0.31402352452278137,
"learning_rate": 6.973734587266474e-05,
"loss": 2.623,
"step": 1508
},
{
"epoch": 0.3792171891688132,
"grad_norm": 0.18773365020751953,
"learning_rate": 6.970060676504608e-05,
"loss": 1.8999,
"step": 1509
},
{
"epoch": 0.37946849280643336,
"grad_norm": 0.39459121227264404,
"learning_rate": 6.966385506193165e-05,
"loss": 2.1866,
"step": 1510
},
{
"epoch": 0.3797197964440535,
"grad_norm": 0.4056554138660431,
"learning_rate": 6.962709078681845e-05,
"loss": 2.512,
"step": 1511
},
{
"epoch": 0.3799711000816737,
"grad_norm": 0.3101893961429596,
"learning_rate": 6.95903139632116e-05,
"loss": 2.4238,
"step": 1512
},
{
"epoch": 0.38022240371929383,
"grad_norm": 0.4230501353740692,
"learning_rate": 6.955352461462418e-05,
"loss": 2.1624,
"step": 1513
},
{
"epoch": 0.380473707356914,
"grad_norm": 0.4053939878940582,
"learning_rate": 6.951672276457728e-05,
"loss": 2.0436,
"step": 1514
},
{
"epoch": 0.38072501099453415,
"grad_norm": 0.3917832672595978,
"learning_rate": 6.947990843660002e-05,
"loss": 2.6129,
"step": 1515
},
{
"epoch": 0.3809763146321543,
"grad_norm": 0.3876727819442749,
"learning_rate": 6.944308165422949e-05,
"loss": 2.2111,
"step": 1516
},
{
"epoch": 0.38122761826977447,
"grad_norm": 0.49222511053085327,
"learning_rate": 6.94062424410107e-05,
"loss": 2.0696,
"step": 1517
},
{
"epoch": 0.38147892190739463,
"grad_norm": 0.7491002678871155,
"learning_rate": 6.936939082049664e-05,
"loss": 1.8861,
"step": 1518
},
{
"epoch": 0.3817302255450148,
"grad_norm": 0.3003447949886322,
"learning_rate": 6.933252681624825e-05,
"loss": 2.344,
"step": 1519
},
{
"epoch": 0.3819815291826349,
"grad_norm": 0.4402701258659363,
"learning_rate": 6.929565045183438e-05,
"loss": 2.1229,
"step": 1520
},
{
"epoch": 0.38223283282025505,
"grad_norm": 0.2871621251106262,
"learning_rate": 6.925876175083174e-05,
"loss": 1.8697,
"step": 1521
},
{
"epoch": 0.3824841364578752,
"grad_norm": 0.3502472937107086,
"learning_rate": 6.922186073682496e-05,
"loss": 2.2823,
"step": 1522
},
{
"epoch": 0.38273544009549537,
"grad_norm": 0.13275161385536194,
"learning_rate": 6.918494743340656e-05,
"loss": 0.4535,
"step": 1523
},
{
"epoch": 0.38298674373311553,
"grad_norm": 0.18874838948249817,
"learning_rate": 6.914802186417689e-05,
"loss": 2.1222,
"step": 1524
},
{
"epoch": 0.3832380473707357,
"grad_norm": 0.2585754096508026,
"learning_rate": 6.911108405274412e-05,
"loss": 2.2653,
"step": 1525
},
{
"epoch": 0.38348935100835585,
"grad_norm": 0.36807018518447876,
"learning_rate": 6.907413402272431e-05,
"loss": 2.3976,
"step": 1526
},
{
"epoch": 0.383740654645976,
"grad_norm": 0.5914536118507385,
"learning_rate": 6.90371717977413e-05,
"loss": 1.9575,
"step": 1527
},
{
"epoch": 0.38399195828359617,
"grad_norm": 0.4083138108253479,
"learning_rate": 6.900019740142672e-05,
"loss": 2.2095,
"step": 1528
},
{
"epoch": 0.38424326192121633,
"grad_norm": 0.33133891224861145,
"learning_rate": 6.896321085741997e-05,
"loss": 2.2339,
"step": 1529
},
{
"epoch": 0.3844945655588365,
"grad_norm": 0.26224732398986816,
"learning_rate": 6.892621218936825e-05,
"loss": 2.3765,
"step": 1530
},
{
"epoch": 0.3847458691964566,
"grad_norm": 0.3711860179901123,
"learning_rate": 6.888920142092647e-05,
"loss": 2.4986,
"step": 1531
},
{
"epoch": 0.38499717283407675,
"grad_norm": 0.1522902250289917,
"learning_rate": 6.885217857575734e-05,
"loss": 0.7433,
"step": 1532
},
{
"epoch": 0.3852484764716969,
"grad_norm": 0.2632433772087097,
"learning_rate": 6.881514367753124e-05,
"loss": 1.6683,
"step": 1533
},
{
"epoch": 0.38549978010931707,
"grad_norm": 0.27296924591064453,
"learning_rate": 6.877809674992625e-05,
"loss": 2.2339,
"step": 1534
},
{
"epoch": 0.38575108374693723,
"grad_norm": 0.3882145285606384,
"learning_rate": 6.874103781662818e-05,
"loss": 2.2982,
"step": 1535
},
{
"epoch": 0.3860023873845574,
"grad_norm": 0.35082605481147766,
"learning_rate": 6.870396690133047e-05,
"loss": 2.2364,
"step": 1536
},
{
"epoch": 0.38625369102217755,
"grad_norm": 0.4729946255683899,
"learning_rate": 6.866688402773427e-05,
"loss": 2.3253,
"step": 1537
},
{
"epoch": 0.3865049946597977,
"grad_norm": 0.42705488204956055,
"learning_rate": 6.862978921954835e-05,
"loss": 2.4093,
"step": 1538
},
{
"epoch": 0.38675629829741787,
"grad_norm": 0.17129021883010864,
"learning_rate": 6.859268250048909e-05,
"loss": 0.8209,
"step": 1539
},
{
"epoch": 0.387007601935038,
"grad_norm": 0.30329659581184387,
"learning_rate": 6.855556389428052e-05,
"loss": 2.3676,
"step": 1540
},
{
"epoch": 0.3872589055726582,
"grad_norm": 0.26523396372795105,
"learning_rate": 6.851843342465428e-05,
"loss": 2.3319,
"step": 1541
},
{
"epoch": 0.38751020921027834,
"grad_norm": 0.32253608107566833,
"learning_rate": 6.848129111534953e-05,
"loss": 2.4377,
"step": 1542
},
{
"epoch": 0.38776151284789845,
"grad_norm": 0.4546685516834259,
"learning_rate": 6.844413699011306e-05,
"loss": 2.1457,
"step": 1543
},
{
"epoch": 0.3880128164855186,
"grad_norm": 0.7805173993110657,
"learning_rate": 6.84069710726992e-05,
"loss": 2.0873,
"step": 1544
},
{
"epoch": 0.38826412012313877,
"grad_norm": 0.46392345428466797,
"learning_rate": 6.836979338686981e-05,
"loss": 1.9017,
"step": 1545
},
{
"epoch": 0.3885154237607589,
"grad_norm": 0.39228349924087524,
"learning_rate": 6.833260395639429e-05,
"loss": 2.2843,
"step": 1546
},
{
"epoch": 0.3887667273983791,
"grad_norm": 0.42995691299438477,
"learning_rate": 6.829540280504951e-05,
"loss": 2.8123,
"step": 1547
},
{
"epoch": 0.38901803103599925,
"grad_norm": 0.4211295247077942,
"learning_rate": 6.82581899566199e-05,
"loss": 1.995,
"step": 1548
},
{
"epoch": 0.3892693346736194,
"grad_norm": 0.43150249123573303,
"learning_rate": 6.822096543489729e-05,
"loss": 2.0724,
"step": 1549
},
{
"epoch": 0.38952063831123956,
"grad_norm": 0.35338032245635986,
"learning_rate": 6.818372926368104e-05,
"loss": 2.0811,
"step": 1550
},
{
"epoch": 0.3897719419488597,
"grad_norm": 0.41222333908081055,
"learning_rate": 6.814648146677793e-05,
"loss": 2.3013,
"step": 1551
},
{
"epoch": 0.3900232455864799,
"grad_norm": 0.4232124984264374,
"learning_rate": 6.810922206800215e-05,
"loss": 2.6888,
"step": 1552
},
{
"epoch": 0.39027454922410004,
"grad_norm": 0.40635040402412415,
"learning_rate": 6.807195109117537e-05,
"loss": 2.409,
"step": 1553
},
{
"epoch": 0.39052585286172015,
"grad_norm": 0.5073560476303101,
"learning_rate": 6.80346685601266e-05,
"loss": 2.5831,
"step": 1554
},
{
"epoch": 0.3907771564993403,
"grad_norm": 0.3237619400024414,
"learning_rate": 6.799737449869226e-05,
"loss": 1.837,
"step": 1555
},
{
"epoch": 0.39102846013696047,
"grad_norm": 0.31155863404273987,
"learning_rate": 6.796006893071615e-05,
"loss": 2.1885,
"step": 1556
},
{
"epoch": 0.3912797637745806,
"grad_norm": 0.5250048637390137,
"learning_rate": 6.792275188004942e-05,
"loss": 2.3971,
"step": 1557
},
{
"epoch": 0.3915310674122008,
"grad_norm": 0.3368058502674103,
"learning_rate": 6.788542337055055e-05,
"loss": 2.3448,
"step": 1558
},
{
"epoch": 0.39178237104982094,
"grad_norm": 0.40310317277908325,
"learning_rate": 6.784808342608537e-05,
"loss": 2.1963,
"step": 1559
},
{
"epoch": 0.3920336746874411,
"grad_norm": 0.34813496470451355,
"learning_rate": 6.781073207052702e-05,
"loss": 2.0816,
"step": 1560
},
{
"epoch": 0.39228497832506126,
"grad_norm": 0.4611975848674774,
"learning_rate": 6.777336932775588e-05,
"loss": 2.4709,
"step": 1561
},
{
"epoch": 0.3925362819626814,
"grad_norm": 0.45631229877471924,
"learning_rate": 6.773599522165971e-05,
"loss": 2.2422,
"step": 1562
},
{
"epoch": 0.3927875856003016,
"grad_norm": 0.3720869719982147,
"learning_rate": 6.769860977613345e-05,
"loss": 1.9893,
"step": 1563
},
{
"epoch": 0.39303888923792174,
"grad_norm": 0.36420589685440063,
"learning_rate": 6.76612130150793e-05,
"loss": 2.4629,
"step": 1564
},
{
"epoch": 0.3932901928755419,
"grad_norm": 0.4226488173007965,
"learning_rate": 6.762380496240675e-05,
"loss": 2.4334,
"step": 1565
},
{
"epoch": 0.393541496513162,
"grad_norm": 0.4091019928455353,
"learning_rate": 6.758638564203245e-05,
"loss": 2.3821,
"step": 1566
},
{
"epoch": 0.39379280015078216,
"grad_norm": 0.24210424721240997,
"learning_rate": 6.75489550778803e-05,
"loss": 0.9144,
"step": 1567
},
{
"epoch": 0.3940441037884023,
"grad_norm": 0.23522338271141052,
"learning_rate": 6.751151329388136e-05,
"loss": 1.7405,
"step": 1568
},
{
"epoch": 0.3942954074260225,
"grad_norm": 0.15203450620174408,
"learning_rate": 6.747406031397384e-05,
"loss": 0.7777,
"step": 1569
},
{
"epoch": 0.39454671106364264,
"grad_norm": 0.22364236414432526,
"learning_rate": 6.743659616210316e-05,
"loss": 2.0508,
"step": 1570
},
{
"epoch": 0.3947980147012628,
"grad_norm": 0.3944762945175171,
"learning_rate": 6.739912086222187e-05,
"loss": 1.9432,
"step": 1571
},
{
"epoch": 0.39504931833888296,
"grad_norm": 0.46476662158966064,
"learning_rate": 6.736163443828962e-05,
"loss": 1.9567,
"step": 1572
},
{
"epoch": 0.3953006219765031,
"grad_norm": 0.4436105489730835,
"learning_rate": 6.732413691427318e-05,
"loss": 1.9792,
"step": 1573
},
{
"epoch": 0.3955519256141233,
"grad_norm": 0.3982658088207245,
"learning_rate": 6.728662831414646e-05,
"loss": 1.8666,
"step": 1574
},
{
"epoch": 0.39580322925174344,
"grad_norm": 0.30122610926628113,
"learning_rate": 6.724910866189038e-05,
"loss": 2.4439,
"step": 1575
},
{
"epoch": 0.3960545328893636,
"grad_norm": 0.3222323954105377,
"learning_rate": 6.7211577981493e-05,
"loss": 2.4668,
"step": 1576
},
{
"epoch": 0.3963058365269837,
"grad_norm": 0.5659331679344177,
"learning_rate": 6.717403629694936e-05,
"loss": 2.482,
"step": 1577
},
{
"epoch": 0.39655714016460386,
"grad_norm": 0.2780524492263794,
"learning_rate": 6.713648363226159e-05,
"loss": 1.9637,
"step": 1578
},
{
"epoch": 0.396808443802224,
"grad_norm": 0.34126976132392883,
"learning_rate": 6.709892001143882e-05,
"loss": 0.7672,
"step": 1579
},
{
"epoch": 0.3970597474398442,
"grad_norm": 0.4123145639896393,
"learning_rate": 6.706134545849718e-05,
"loss": 2.1787,
"step": 1580
},
{
"epoch": 0.39731105107746434,
"grad_norm": 0.45804792642593384,
"learning_rate": 6.702375999745979e-05,
"loss": 2.5832,
"step": 1581
},
{
"epoch": 0.3975623547150845,
"grad_norm": 0.3806130886077881,
"learning_rate": 6.698616365235676e-05,
"loss": 2.0183,
"step": 1582
},
{
"epoch": 0.39781365835270466,
"grad_norm": 0.4113427996635437,
"learning_rate": 6.694855644722513e-05,
"loss": 2.3072,
"step": 1583
},
{
"epoch": 0.3980649619903248,
"grad_norm": 0.43049007654190063,
"learning_rate": 6.691093840610892e-05,
"loss": 2.1318,
"step": 1584
},
{
"epoch": 0.398316265627945,
"grad_norm": 0.5058295726776123,
"learning_rate": 6.687330955305904e-05,
"loss": 2.3739,
"step": 1585
},
{
"epoch": 0.39856756926556514,
"grad_norm": 0.3820212781429291,
"learning_rate": 6.683566991213334e-05,
"loss": 2.4582,
"step": 1586
},
{
"epoch": 0.3988188729031853,
"grad_norm": 0.6381146907806396,
"learning_rate": 6.679801950739655e-05,
"loss": 2.6437,
"step": 1587
},
{
"epoch": 0.39907017654080545,
"grad_norm": 0.26425233483314514,
"learning_rate": 6.67603583629203e-05,
"loss": 1.7483,
"step": 1588
},
{
"epoch": 0.39932148017842556,
"grad_norm": 0.1781735122203827,
"learning_rate": 6.672268650278308e-05,
"loss": 0.692,
"step": 1589
},
{
"epoch": 0.3995727838160457,
"grad_norm": 0.22695676982402802,
"learning_rate": 6.668500395107023e-05,
"loss": 1.9322,
"step": 1590
},
{
"epoch": 0.3998240874536659,
"grad_norm": 0.4311312735080719,
"learning_rate": 6.66473107318739e-05,
"loss": 2.4121,
"step": 1591
},
{
"epoch": 0.40007539109128604,
"grad_norm": 0.3456994593143463,
"learning_rate": 6.660960686929308e-05,
"loss": 2.0809,
"step": 1592
},
{
"epoch": 0.4003266947289062,
"grad_norm": 0.3440932333469391,
"learning_rate": 6.65718923874336e-05,
"loss": 2.5492,
"step": 1593
},
{
"epoch": 0.40057799836652636,
"grad_norm": 0.486544668674469,
"learning_rate": 6.653416731040802e-05,
"loss": 2.3515,
"step": 1594
},
{
"epoch": 0.4008293020041465,
"grad_norm": 0.48320162296295166,
"learning_rate": 6.649643166233573e-05,
"loss": 1.9959,
"step": 1595
},
{
"epoch": 0.4010806056417667,
"grad_norm": 0.5600380897521973,
"learning_rate": 6.645868546734282e-05,
"loss": 1.6978,
"step": 1596
},
{
"epoch": 0.40133190927938683,
"grad_norm": 0.41357421875,
"learning_rate": 6.642092874956217e-05,
"loss": 2.1349,
"step": 1597
},
{
"epoch": 0.401583212917007,
"grad_norm": 0.4483415186405182,
"learning_rate": 6.638316153313335e-05,
"loss": 2.0338,
"step": 1598
},
{
"epoch": 0.40183451655462715,
"grad_norm": 0.24149690568447113,
"learning_rate": 6.634538384220268e-05,
"loss": 2.2433,
"step": 1599
},
{
"epoch": 0.40208582019224726,
"grad_norm": 0.3381589949131012,
"learning_rate": 6.630759570092317e-05,
"loss": 2.0402,
"step": 1600
},
{
"epoch": 0.4023371238298674,
"grad_norm": 0.25250038504600525,
"learning_rate": 6.626979713345448e-05,
"loss": 2.2019,
"step": 1601
},
{
"epoch": 0.4025884274674876,
"grad_norm": 0.31296542286872864,
"learning_rate": 6.623198816396297e-05,
"loss": 2.4589,
"step": 1602
},
{
"epoch": 0.40283973110510773,
"grad_norm": 0.36123234033584595,
"learning_rate": 6.619416881662162e-05,
"loss": 2.0428,
"step": 1603
},
{
"epoch": 0.4030910347427279,
"grad_norm": 0.3003954589366913,
"learning_rate": 6.615633911561007e-05,
"loss": 2.1773,
"step": 1604
},
{
"epoch": 0.40334233838034805,
"grad_norm": 0.4871181845664978,
"learning_rate": 6.61184990851146e-05,
"loss": 2.0408,
"step": 1605
},
{
"epoch": 0.4035936420179682,
"grad_norm": 0.4325210452079773,
"learning_rate": 6.608064874932804e-05,
"loss": 2.5123,
"step": 1606
},
{
"epoch": 0.40384494565558837,
"grad_norm": 0.3475637137889862,
"learning_rate": 6.604278813244982e-05,
"loss": 2.1961,
"step": 1607
},
{
"epoch": 0.40409624929320853,
"grad_norm": 0.3930399715900421,
"learning_rate": 6.600491725868599e-05,
"loss": 1.8388,
"step": 1608
},
{
"epoch": 0.4043475529308287,
"grad_norm": 0.25549179315567017,
"learning_rate": 6.59670361522491e-05,
"loss": 1.1064,
"step": 1609
},
{
"epoch": 0.40459885656844885,
"grad_norm": 0.3450826406478882,
"learning_rate": 6.592914483735828e-05,
"loss": 2.1311,
"step": 1610
},
{
"epoch": 0.404850160206069,
"grad_norm": 0.32824480533599854,
"learning_rate": 6.589124333823918e-05,
"loss": 2.0151,
"step": 1611
},
{
"epoch": 0.4051014638436891,
"grad_norm": 0.5576545596122742,
"learning_rate": 6.585333167912394e-05,
"loss": 1.882,
"step": 1612
},
{
"epoch": 0.4053527674813093,
"grad_norm": 0.43908435106277466,
"learning_rate": 6.581540988425123e-05,
"loss": 1.7746,
"step": 1613
},
{
"epoch": 0.40560407111892943,
"grad_norm": 0.5368839502334595,
"learning_rate": 6.577747797786617e-05,
"loss": 2.3111,
"step": 1614
},
{
"epoch": 0.4058553747565496,
"grad_norm": 0.32215848565101624,
"learning_rate": 6.573953598422036e-05,
"loss": 2.2492,
"step": 1615
},
{
"epoch": 0.40610667839416975,
"grad_norm": 0.18906524777412415,
"learning_rate": 6.570158392757184e-05,
"loss": 1.7845,
"step": 1616
},
{
"epoch": 0.4063579820317899,
"grad_norm": 0.522850751876831,
"learning_rate": 6.566362183218511e-05,
"loss": 1.9706,
"step": 1617
},
{
"epoch": 0.40660928566941007,
"grad_norm": 0.5646048784255981,
"learning_rate": 6.562564972233103e-05,
"loss": 2.1804,
"step": 1618
},
{
"epoch": 0.40686058930703023,
"grad_norm": 0.41810548305511475,
"learning_rate": 6.558766762228693e-05,
"loss": 2.2449,
"step": 1619
},
{
"epoch": 0.4071118929446504,
"grad_norm": 0.4327705204486847,
"learning_rate": 6.554967555633649e-05,
"loss": 2.6681,
"step": 1620
},
{
"epoch": 0.40736319658227055,
"grad_norm": 0.3351784944534302,
"learning_rate": 6.551167354876977e-05,
"loss": 2.2619,
"step": 1621
},
{
"epoch": 0.4076145002198907,
"grad_norm": 0.31251177191734314,
"learning_rate": 6.547366162388319e-05,
"loss": 2.1284,
"step": 1622
},
{
"epoch": 0.4078658038575108,
"grad_norm": 0.20730093121528625,
"learning_rate": 6.543563980597949e-05,
"loss": 1.2999,
"step": 1623
},
{
"epoch": 0.40811710749513097,
"grad_norm": 0.4779922664165497,
"learning_rate": 6.539760811936777e-05,
"loss": 2.5214,
"step": 1624
},
{
"epoch": 0.40836841113275113,
"grad_norm": 0.38662630319595337,
"learning_rate": 6.535956658836341e-05,
"loss": 2.46,
"step": 1625
},
{
"epoch": 0.4086197147703713,
"grad_norm": 0.2927972674369812,
"learning_rate": 6.53215152372881e-05,
"loss": 2.3501,
"step": 1626
},
{
"epoch": 0.40887101840799145,
"grad_norm": 0.4462030231952667,
"learning_rate": 6.52834540904698e-05,
"loss": 2.0303,
"step": 1627
},
{
"epoch": 0.4091223220456116,
"grad_norm": 0.3924959599971771,
"learning_rate": 6.524538317224273e-05,
"loss": 2.2702,
"step": 1628
},
{
"epoch": 0.40937362568323177,
"grad_norm": 0.34013834595680237,
"learning_rate": 6.520730250694738e-05,
"loss": 1.6836,
"step": 1629
},
{
"epoch": 0.4096249293208519,
"grad_norm": 0.42472806572914124,
"learning_rate": 6.516921211893041e-05,
"loss": 2.0513,
"step": 1630
},
{
"epoch": 0.4098762329584721,
"grad_norm": 0.4425092935562134,
"learning_rate": 6.51311120325448e-05,
"loss": 2.092,
"step": 1631
},
{
"epoch": 0.41012753659609225,
"grad_norm": 0.41965746879577637,
"learning_rate": 6.509300227214965e-05,
"loss": 2.5082,
"step": 1632
},
{
"epoch": 0.4103788402337124,
"grad_norm": 0.41122114658355713,
"learning_rate": 6.505488286211027e-05,
"loss": 2.2883,
"step": 1633
},
{
"epoch": 0.41063014387133256,
"grad_norm": 0.2704785466194153,
"learning_rate": 6.501675382679812e-05,
"loss": 2.2976,
"step": 1634
},
{
"epoch": 0.41088144750895267,
"grad_norm": 0.4489680230617523,
"learning_rate": 6.497861519059084e-05,
"loss": 2.1877,
"step": 1635
},
{
"epoch": 0.41113275114657283,
"grad_norm": 0.4025135934352875,
"learning_rate": 6.49404669778722e-05,
"loss": 2.1432,
"step": 1636
},
{
"epoch": 0.411384054784193,
"grad_norm": 0.31426647305488586,
"learning_rate": 6.490230921303209e-05,
"loss": 2.3581,
"step": 1637
},
{
"epoch": 0.41163535842181315,
"grad_norm": 0.4263187348842621,
"learning_rate": 6.48641419204665e-05,
"loss": 2.0292,
"step": 1638
},
{
"epoch": 0.4118866620594333,
"grad_norm": 0.19534143805503845,
"learning_rate": 6.482596512457754e-05,
"loss": 1.0616,
"step": 1639
},
{
"epoch": 0.41213796569705347,
"grad_norm": 0.1499776840209961,
"learning_rate": 6.478777884977335e-05,
"loss": 0.5066,
"step": 1640
},
{
"epoch": 0.4123892693346736,
"grad_norm": 0.2703084647655487,
"learning_rate": 6.474958312046817e-05,
"loss": 2.4124,
"step": 1641
},
{
"epoch": 0.4126405729722938,
"grad_norm": 0.48154720664024353,
"learning_rate": 6.471137796108227e-05,
"loss": 2.4563,
"step": 1642
},
{
"epoch": 0.41289187660991394,
"grad_norm": 0.6079049706459045,
"learning_rate": 6.467316339604197e-05,
"loss": 2.4903,
"step": 1643
},
{
"epoch": 0.4131431802475341,
"grad_norm": 0.7089735865592957,
"learning_rate": 6.463493944977954e-05,
"loss": 2.7061,
"step": 1644
},
{
"epoch": 0.41339448388515426,
"grad_norm": 0.48337823152542114,
"learning_rate": 6.459670614673332e-05,
"loss": 1.8887,
"step": 1645
},
{
"epoch": 0.41364578752277437,
"grad_norm": 0.2442682683467865,
"learning_rate": 6.455846351134759e-05,
"loss": 2.4887,
"step": 1646
},
{
"epoch": 0.4138970911603945,
"grad_norm": 0.42603641748428345,
"learning_rate": 6.452021156807262e-05,
"loss": 2.0719,
"step": 1647
},
{
"epoch": 0.4141483947980147,
"grad_norm": 0.2770818769931793,
"learning_rate": 6.448195034136461e-05,
"loss": 1.7527,
"step": 1648
},
{
"epoch": 0.41439969843563484,
"grad_norm": 0.3582463562488556,
"learning_rate": 6.444367985568571e-05,
"loss": 2.3942,
"step": 1649
},
{
"epoch": 0.414651002073255,
"grad_norm": 0.31034761667251587,
"learning_rate": 6.4405400135504e-05,
"loss": 2.3178,
"step": 1650
},
{
"epoch": 0.41490230571087516,
"grad_norm": 0.19327189028263092,
"learning_rate": 6.436711120529343e-05,
"loss": 1.4941,
"step": 1651
},
{
"epoch": 0.4151536093484953,
"grad_norm": 0.47710877656936646,
"learning_rate": 6.432881308953386e-05,
"loss": 1.9341,
"step": 1652
},
{
"epoch": 0.4154049129861155,
"grad_norm": 0.44465112686157227,
"learning_rate": 6.429050581271105e-05,
"loss": 2.5935,
"step": 1653
},
{
"epoch": 0.41565621662373564,
"grad_norm": 0.5776760578155518,
"learning_rate": 6.425218939931654e-05,
"loss": 2.3612,
"step": 1654
},
{
"epoch": 0.4159075202613558,
"grad_norm": 0.14499108493328094,
"learning_rate": 6.42138638738478e-05,
"loss": 1.0138,
"step": 1655
},
{
"epoch": 0.41615882389897596,
"grad_norm": 0.4578950107097626,
"learning_rate": 6.417552926080806e-05,
"loss": 2.4257,
"step": 1656
},
{
"epoch": 0.4164101275365961,
"grad_norm": 0.4383941888809204,
"learning_rate": 6.413718558470639e-05,
"loss": 2.303,
"step": 1657
},
{
"epoch": 0.4166614311742162,
"grad_norm": 0.24252034723758698,
"learning_rate": 6.409883287005767e-05,
"loss": 2.3268,
"step": 1658
},
{
"epoch": 0.4169127348118364,
"grad_norm": 0.6126996874809265,
"learning_rate": 6.406047114138252e-05,
"loss": 2.0916,
"step": 1659
},
{
"epoch": 0.41716403844945654,
"grad_norm": 0.49406373500823975,
"learning_rate": 6.402210042320733e-05,
"loss": 1.8098,
"step": 1660
},
{
"epoch": 0.4174153420870767,
"grad_norm": 0.4942653179168701,
"learning_rate": 6.398372074006428e-05,
"loss": 1.6953,
"step": 1661
},
{
"epoch": 0.41766664572469686,
"grad_norm": 0.2597447335720062,
"learning_rate": 6.394533211649122e-05,
"loss": 2.0503,
"step": 1662
},
{
"epoch": 0.417917949362317,
"grad_norm": 0.4600328207015991,
"learning_rate": 6.390693457703177e-05,
"loss": 2.0682,
"step": 1663
},
{
"epoch": 0.4181692529999372,
"grad_norm": 0.3954939842224121,
"learning_rate": 6.386852814623522e-05,
"loss": 2.239,
"step": 1664
},
{
"epoch": 0.41842055663755734,
"grad_norm": 0.36097919940948486,
"learning_rate": 6.383011284865654e-05,
"loss": 2.4497,
"step": 1665
},
{
"epoch": 0.4186718602751775,
"grad_norm": 0.40833091735839844,
"learning_rate": 6.37916887088564e-05,
"loss": 2.2907,
"step": 1666
},
{
"epoch": 0.41892316391279766,
"grad_norm": 0.41030532121658325,
"learning_rate": 6.375325575140108e-05,
"loss": 2.2066,
"step": 1667
},
{
"epoch": 0.4191744675504178,
"grad_norm": 0.43152961134910583,
"learning_rate": 6.371481400086254e-05,
"loss": 2.0852,
"step": 1668
},
{
"epoch": 0.4194257711880379,
"grad_norm": 0.3646620810031891,
"learning_rate": 6.367636348181835e-05,
"loss": 1.9575,
"step": 1669
},
{
"epoch": 0.4196770748256581,
"grad_norm": 0.39278149604797363,
"learning_rate": 6.363790421885165e-05,
"loss": 2.0296,
"step": 1670
},
{
"epoch": 0.41992837846327824,
"grad_norm": 0.26738765835762024,
"learning_rate": 6.359943623655122e-05,
"loss": 2.2653,
"step": 1671
},
{
"epoch": 0.4201796821008984,
"grad_norm": 0.41789373755455017,
"learning_rate": 6.35609595595114e-05,
"loss": 2.3384,
"step": 1672
},
{
"epoch": 0.42043098573851856,
"grad_norm": 0.4324728846549988,
"learning_rate": 6.352247421233207e-05,
"loss": 1.8498,
"step": 1673
},
{
"epoch": 0.4206822893761387,
"grad_norm": 0.40609726309776306,
"learning_rate": 6.348398021961868e-05,
"loss": 2.4486,
"step": 1674
},
{
"epoch": 0.4209335930137589,
"grad_norm": 0.36156487464904785,
"learning_rate": 6.344547760598217e-05,
"loss": 2.2026,
"step": 1675
},
{
"epoch": 0.42118489665137904,
"grad_norm": 0.4976545572280884,
"learning_rate": 6.340696639603905e-05,
"loss": 2.1005,
"step": 1676
},
{
"epoch": 0.4214362002889992,
"grad_norm": 0.25209322571754456,
"learning_rate": 6.336844661441126e-05,
"loss": 1.9217,
"step": 1677
},
{
"epoch": 0.42168750392661936,
"grad_norm": 0.22021417319774628,
"learning_rate": 6.332991828572627e-05,
"loss": 1.8579,
"step": 1678
},
{
"epoch": 0.4219388075642395,
"grad_norm": 0.24054262042045593,
"learning_rate": 6.329138143461698e-05,
"loss": 1.9387,
"step": 1679
},
{
"epoch": 0.4221901112018597,
"grad_norm": 0.19225017726421356,
"learning_rate": 6.325283608572178e-05,
"loss": 1.4229,
"step": 1680
},
{
"epoch": 0.4224414148394798,
"grad_norm": 0.3207547962665558,
"learning_rate": 6.321428226368444e-05,
"loss": 2.6563,
"step": 1681
},
{
"epoch": 0.42269271847709994,
"grad_norm": 0.6084730625152588,
"learning_rate": 6.31757199931542e-05,
"loss": 2.1161,
"step": 1682
},
{
"epoch": 0.4229440221147201,
"grad_norm": 0.4410717487335205,
"learning_rate": 6.313714929878566e-05,
"loss": 2.3267,
"step": 1683
},
{
"epoch": 0.42319532575234026,
"grad_norm": 0.36312195658683777,
"learning_rate": 6.309857020523884e-05,
"loss": 2.1917,
"step": 1684
},
{
"epoch": 0.4234466293899604,
"grad_norm": 0.3438815176486969,
"learning_rate": 6.305998273717909e-05,
"loss": 2.3029,
"step": 1685
},
{
"epoch": 0.4236979330275806,
"grad_norm": 0.39790135622024536,
"learning_rate": 6.302138691927715e-05,
"loss": 2.2402,
"step": 1686
},
{
"epoch": 0.42394923666520073,
"grad_norm": 0.6604608297348022,
"learning_rate": 6.29827827762091e-05,
"loss": 2.0682,
"step": 1687
},
{
"epoch": 0.4242005403028209,
"grad_norm": 0.39813148975372314,
"learning_rate": 6.29441703326563e-05,
"loss": 2.1918,
"step": 1688
},
{
"epoch": 0.42445184394044105,
"grad_norm": 0.2750075161457062,
"learning_rate": 6.290554961330546e-05,
"loss": 2.3667,
"step": 1689
},
{
"epoch": 0.4247031475780612,
"grad_norm": 0.7254701852798462,
"learning_rate": 6.286692064284858e-05,
"loss": 2.1079,
"step": 1690
},
{
"epoch": 0.4249544512156814,
"grad_norm": 0.3623849153518677,
"learning_rate": 6.282828344598289e-05,
"loss": 2.2025,
"step": 1691
},
{
"epoch": 0.4252057548533015,
"grad_norm": 0.498189240694046,
"learning_rate": 6.278963804741092e-05,
"loss": 1.854,
"step": 1692
},
{
"epoch": 0.42545705849092164,
"grad_norm": 0.4776882231235504,
"learning_rate": 6.275098447184045e-05,
"loss": 1.931,
"step": 1693
},
{
"epoch": 0.4257083621285418,
"grad_norm": 0.3936712145805359,
"learning_rate": 6.271232274398447e-05,
"loss": 2.1005,
"step": 1694
},
{
"epoch": 0.42595966576616195,
"grad_norm": 0.3415871262550354,
"learning_rate": 6.267365288856117e-05,
"loss": 2.1024,
"step": 1695
},
{
"epoch": 0.4262109694037821,
"grad_norm": 0.40847861766815186,
"learning_rate": 6.263497493029396e-05,
"loss": 2.3391,
"step": 1696
},
{
"epoch": 0.4264622730414023,
"grad_norm": 0.38105782866477966,
"learning_rate": 6.259628889391142e-05,
"loss": 2.3127,
"step": 1697
},
{
"epoch": 0.42671357667902243,
"grad_norm": 0.5018170475959778,
"learning_rate": 6.255759480414732e-05,
"loss": 2.2537,
"step": 1698
},
{
"epoch": 0.4269648803166426,
"grad_norm": 0.2998669445514679,
"learning_rate": 6.25188926857405e-05,
"loss": 2.0985,
"step": 1699
},
{
"epoch": 0.42721618395426275,
"grad_norm": 0.17855164408683777,
"learning_rate": 6.248018256343504e-05,
"loss": 0.5861,
"step": 1700
},
{
"epoch": 0.4274674875918829,
"grad_norm": 0.3114035725593567,
"learning_rate": 6.244146446198006e-05,
"loss": 2.144,
"step": 1701
},
{
"epoch": 0.42771879122950307,
"grad_norm": 0.3799639940261841,
"learning_rate": 6.240273840612981e-05,
"loss": 2.4188,
"step": 1702
},
{
"epoch": 0.4279700948671232,
"grad_norm": 0.5280464291572571,
"learning_rate": 6.236400442064363e-05,
"loss": 2.1123,
"step": 1703
},
{
"epoch": 0.42822139850474333,
"grad_norm": 0.237023264169693,
"learning_rate": 6.232526253028593e-05,
"loss": 1.9309,
"step": 1704
},
{
"epoch": 0.4284727021423635,
"grad_norm": 0.3898535668849945,
"learning_rate": 6.228651275982614e-05,
"loss": 1.7865,
"step": 1705
},
{
"epoch": 0.42872400577998365,
"grad_norm": 2.0331013202667236,
"learning_rate": 6.224775513403877e-05,
"loss": 1.8481,
"step": 1706
},
{
"epoch": 0.4289753094176038,
"grad_norm": 0.3832615613937378,
"learning_rate": 6.220898967770334e-05,
"loss": 2.2707,
"step": 1707
},
{
"epoch": 0.42922661305522397,
"grad_norm": 0.3408937454223633,
"learning_rate": 6.217021641560436e-05,
"loss": 2.4584,
"step": 1708
},
{
"epoch": 0.42947791669284413,
"grad_norm": 0.49640515446662903,
"learning_rate": 6.213143537253134e-05,
"loss": 2.2002,
"step": 1709
},
{
"epoch": 0.4297292203304643,
"grad_norm": 0.33837461471557617,
"learning_rate": 6.20926465732788e-05,
"loss": 2.3568,
"step": 1710
},
{
"epoch": 0.42998052396808445,
"grad_norm": 0.5988655686378479,
"learning_rate": 6.205385004264616e-05,
"loss": 2.2734,
"step": 1711
},
{
"epoch": 0.4302318276057046,
"grad_norm": 0.5065860152244568,
"learning_rate": 6.201504580543782e-05,
"loss": 2.4239,
"step": 1712
},
{
"epoch": 0.43048313124332477,
"grad_norm": 0.5038727521896362,
"learning_rate": 6.197623388646308e-05,
"loss": 1.9147,
"step": 1713
},
{
"epoch": 0.4307344348809449,
"grad_norm": 0.3687746822834015,
"learning_rate": 6.19374143105362e-05,
"loss": 2.6965,
"step": 1714
},
{
"epoch": 0.43098573851856503,
"grad_norm": 0.39210835099220276,
"learning_rate": 6.189858710247629e-05,
"loss": 2.1168,
"step": 1715
},
{
"epoch": 0.4312370421561852,
"grad_norm": 0.5022109746932983,
"learning_rate": 6.185975228710734e-05,
"loss": 2.3949,
"step": 1716
},
{
"epoch": 0.43148834579380535,
"grad_norm": 0.43233370780944824,
"learning_rate": 6.182090988925823e-05,
"loss": 2.0664,
"step": 1717
},
{
"epoch": 0.4317396494314255,
"grad_norm": 0.2932543456554413,
"learning_rate": 6.178205993376268e-05,
"loss": 1.8359,
"step": 1718
},
{
"epoch": 0.43199095306904567,
"grad_norm": 0.2603437602519989,
"learning_rate": 6.174320244545922e-05,
"loss": 2.1517,
"step": 1719
},
{
"epoch": 0.43224225670666583,
"grad_norm": 0.398425817489624,
"learning_rate": 6.170433744919123e-05,
"loss": 1.9729,
"step": 1720
},
{
"epoch": 0.432493560344286,
"grad_norm": 0.44565537571907043,
"learning_rate": 6.166546496980688e-05,
"loss": 1.9102,
"step": 1721
},
{
"epoch": 0.43274486398190615,
"grad_norm": 0.38102757930755615,
"learning_rate": 6.16265850321591e-05,
"loss": 2.265,
"step": 1722
},
{
"epoch": 0.4329961676195263,
"grad_norm": 0.40929126739501953,
"learning_rate": 6.158769766110561e-05,
"loss": 2.4499,
"step": 1723
},
{
"epoch": 0.43324747125714647,
"grad_norm": 0.4028054475784302,
"learning_rate": 6.154880288150888e-05,
"loss": 2.4045,
"step": 1724
},
{
"epoch": 0.4334987748947666,
"grad_norm": 0.514401912689209,
"learning_rate": 6.15099007182361e-05,
"loss": 2.2548,
"step": 1725
},
{
"epoch": 0.43375007853238673,
"grad_norm": 0.42896950244903564,
"learning_rate": 6.147099119615923e-05,
"loss": 2.1909,
"step": 1726
},
{
"epoch": 0.4340013821700069,
"grad_norm": 0.3507866859436035,
"learning_rate": 6.143207434015487e-05,
"loss": 2.3633,
"step": 1727
},
{
"epoch": 0.43425268580762705,
"grad_norm": 0.26029667258262634,
"learning_rate": 6.139315017510437e-05,
"loss": 1.9713,
"step": 1728
},
{
"epoch": 0.4345039894452472,
"grad_norm": 0.5305449962615967,
"learning_rate": 6.135421872589369e-05,
"loss": 2.1603,
"step": 1729
},
{
"epoch": 0.43475529308286737,
"grad_norm": 0.39215922355651855,
"learning_rate": 6.13152800174135e-05,
"loss": 2.0211,
"step": 1730
},
{
"epoch": 0.4350065967204875,
"grad_norm": 0.4130334258079529,
"learning_rate": 6.127633407455909e-05,
"loss": 2.0349,
"step": 1731
},
{
"epoch": 0.4352579003581077,
"grad_norm": 0.4265572428703308,
"learning_rate": 6.123738092223036e-05,
"loss": 1.9254,
"step": 1732
},
{
"epoch": 0.43550920399572784,
"grad_norm": 0.43121498823165894,
"learning_rate": 6.119842058533185e-05,
"loss": 2.0005,
"step": 1733
},
{
"epoch": 0.435760507633348,
"grad_norm": 0.4759058952331543,
"learning_rate": 6.115945308877269e-05,
"loss": 2.1132,
"step": 1734
},
{
"epoch": 0.43601181127096816,
"grad_norm": 0.278901606798172,
"learning_rate": 6.112047845746654e-05,
"loss": 2.3716,
"step": 1735
},
{
"epoch": 0.4362631149085883,
"grad_norm": 0.2603948414325714,
"learning_rate": 6.10814967163317e-05,
"loss": 1.7418,
"step": 1736
},
{
"epoch": 0.4365144185462085,
"grad_norm": 0.4486573338508606,
"learning_rate": 6.104250789029096e-05,
"loss": 2.3385,
"step": 1737
},
{
"epoch": 0.4367657221838286,
"grad_norm": 0.23672929406166077,
"learning_rate": 6.1003512004271634e-05,
"loss": 2.1366,
"step": 1738
},
{
"epoch": 0.43701702582144875,
"grad_norm": 0.31534305214881897,
"learning_rate": 6.09645090832056e-05,
"loss": 1.7805,
"step": 1739
},
{
"epoch": 0.4372683294590689,
"grad_norm": 0.34337756037712097,
"learning_rate": 6.0925499152029175e-05,
"loss": 2.5157,
"step": 1740
},
{
"epoch": 0.43751963309668906,
"grad_norm": 0.4734466075897217,
"learning_rate": 6.0886482235683195e-05,
"loss": 1.9865,
"step": 1741
},
{
"epoch": 0.4377709367343092,
"grad_norm": 0.49610087275505066,
"learning_rate": 6.084745835911296e-05,
"loss": 1.8706,
"step": 1742
},
{
"epoch": 0.4380222403719294,
"grad_norm": 0.2000686675310135,
"learning_rate": 6.080842754726821e-05,
"loss": 2.1542,
"step": 1743
},
{
"epoch": 0.43827354400954954,
"grad_norm": 0.3884282410144806,
"learning_rate": 6.076938982510312e-05,
"loss": 2.4855,
"step": 1744
},
{
"epoch": 0.4385248476471697,
"grad_norm": 0.2191421538591385,
"learning_rate": 6.0730345217576276e-05,
"loss": 1.9066,
"step": 1745
},
{
"epoch": 0.43877615128478986,
"grad_norm": 0.37026357650756836,
"learning_rate": 6.06912937496507e-05,
"loss": 2.5894,
"step": 1746
},
{
"epoch": 0.43902745492241,
"grad_norm": 0.32741832733154297,
"learning_rate": 6.065223544629375e-05,
"loss": 2.3304,
"step": 1747
},
{
"epoch": 0.4392787585600302,
"grad_norm": 0.46646368503570557,
"learning_rate": 6.0613170332477185e-05,
"loss": 2.392,
"step": 1748
},
{
"epoch": 0.4395300621976503,
"grad_norm": 0.2375309318304062,
"learning_rate": 6.057409843317713e-05,
"loss": 1.1848,
"step": 1749
},
{
"epoch": 0.43978136583527044,
"grad_norm": 0.6087144613265991,
"learning_rate": 6.0535019773374014e-05,
"loss": 2.0747,
"step": 1750
},
{
"epoch": 0.4400326694728906,
"grad_norm": 0.33546391129493713,
"learning_rate": 6.04959343780526e-05,
"loss": 2.0869,
"step": 1751
},
{
"epoch": 0.44028397311051076,
"grad_norm": 0.4496229887008667,
"learning_rate": 6.0456842272201974e-05,
"loss": 2.2199,
"step": 1752
},
{
"epoch": 0.4405352767481309,
"grad_norm": 0.3253905773162842,
"learning_rate": 6.04177434808155e-05,
"loss": 2.2447,
"step": 1753
},
{
"epoch": 0.4407865803857511,
"grad_norm": 0.3203786313533783,
"learning_rate": 6.037863802889082e-05,
"loss": 2.1416,
"step": 1754
},
{
"epoch": 0.44103788402337124,
"grad_norm": 0.38746264576911926,
"learning_rate": 6.033952594142983e-05,
"loss": 2.1872,
"step": 1755
},
{
"epoch": 0.4412891876609914,
"grad_norm": 0.4595598578453064,
"learning_rate": 6.030040724343866e-05,
"loss": 2.002,
"step": 1756
},
{
"epoch": 0.44154049129861156,
"grad_norm": 0.28612473607063293,
"learning_rate": 6.0261281959927694e-05,
"loss": 2.1546,
"step": 1757
},
{
"epoch": 0.4417917949362317,
"grad_norm": 4.791689872741699,
"learning_rate": 6.022215011591148e-05,
"loss": 2.2476,
"step": 1758
},
{
"epoch": 0.4420430985738519,
"grad_norm": 0.5441824793815613,
"learning_rate": 6.0183011736408825e-05,
"loss": 2.5157,
"step": 1759
},
{
"epoch": 0.44229440221147204,
"grad_norm": 0.42228463292121887,
"learning_rate": 6.014386684644265e-05,
"loss": 2.3466,
"step": 1760
},
{
"epoch": 0.44254570584909214,
"grad_norm": 0.3516453504562378,
"learning_rate": 6.01047154710401e-05,
"loss": 2.0042,
"step": 1761
},
{
"epoch": 0.4427970094867123,
"grad_norm": 0.34274590015411377,
"learning_rate": 6.006555763523239e-05,
"loss": 2.2938,
"step": 1762
},
{
"epoch": 0.44304831312433246,
"grad_norm": 0.36631983518600464,
"learning_rate": 6.0026393364054957e-05,
"loss": 2.131,
"step": 1763
},
{
"epoch": 0.4432996167619526,
"grad_norm": 0.3376319110393524,
"learning_rate": 5.998722268254726e-05,
"loss": 1.9654,
"step": 1764
},
{
"epoch": 0.4435509203995728,
"grad_norm": 0.27467769384384155,
"learning_rate": 5.994804561575294e-05,
"loss": 1.5135,
"step": 1765
},
{
"epoch": 0.44380222403719294,
"grad_norm": 0.5476408004760742,
"learning_rate": 5.990886218871965e-05,
"loss": 2.0412,
"step": 1766
},
{
"epoch": 0.4440535276748131,
"grad_norm": 0.4842160940170288,
"learning_rate": 5.986967242649916e-05,
"loss": 2.4164,
"step": 1767
},
{
"epoch": 0.44430483131243326,
"grad_norm": 0.2067529559135437,
"learning_rate": 5.983047635414726e-05,
"loss": 0.7425,
"step": 1768
},
{
"epoch": 0.4445561349500534,
"grad_norm": 0.4682113826274872,
"learning_rate": 5.9791273996723785e-05,
"loss": 2.1776,
"step": 1769
},
{
"epoch": 0.4448074385876736,
"grad_norm": 0.5939032435417175,
"learning_rate": 5.975206537929259e-05,
"loss": 2.251,
"step": 1770
},
{
"epoch": 0.44505874222529374,
"grad_norm": 0.4370846748352051,
"learning_rate": 5.971285052692155e-05,
"loss": 2.518,
"step": 1771
},
{
"epoch": 0.44531004586291384,
"grad_norm": 0.42708754539489746,
"learning_rate": 5.967362946468248e-05,
"loss": 1.8735,
"step": 1772
},
{
"epoch": 0.445561349500534,
"grad_norm": 0.3979972302913666,
"learning_rate": 5.96344022176512e-05,
"loss": 2.3454,
"step": 1773
},
{
"epoch": 0.44581265313815416,
"grad_norm": 0.33092716336250305,
"learning_rate": 5.95951688109075e-05,
"loss": 2.4123,
"step": 1774
},
{
"epoch": 0.4460639567757743,
"grad_norm": 0.4420959949493408,
"learning_rate": 5.955592926953505e-05,
"loss": 1.9745,
"step": 1775
},
{
"epoch": 0.4463152604133945,
"grad_norm": 0.21554313600063324,
"learning_rate": 5.951668361862149e-05,
"loss": 0.8182,
"step": 1776
},
{
"epoch": 0.44656656405101464,
"grad_norm": 0.4098570644855499,
"learning_rate": 5.947743188325837e-05,
"loss": 1.7592,
"step": 1777
},
{
"epoch": 0.4468178676886348,
"grad_norm": 0.2512660324573517,
"learning_rate": 5.94381740885411e-05,
"loss": 1.8342,
"step": 1778
},
{
"epoch": 0.44706917132625495,
"grad_norm": 0.49591559171676636,
"learning_rate": 5.939891025956896e-05,
"loss": 2.1854,
"step": 1779
},
{
"epoch": 0.4473204749638751,
"grad_norm": 0.5562049746513367,
"learning_rate": 5.935964042144515e-05,
"loss": 2.0548,
"step": 1780
},
{
"epoch": 0.4475717786014953,
"grad_norm": 0.2882930338382721,
"learning_rate": 5.932036459927662e-05,
"loss": 1.7693,
"step": 1781
},
{
"epoch": 0.44782308223911543,
"grad_norm": 0.4819808304309845,
"learning_rate": 5.928108281817422e-05,
"loss": 2.1611,
"step": 1782
},
{
"epoch": 0.4480743858767356,
"grad_norm": 0.4501156210899353,
"learning_rate": 5.924179510325258e-05,
"loss": 1.9628,
"step": 1783
},
{
"epoch": 0.4483256895143557,
"grad_norm": 0.43177562952041626,
"learning_rate": 5.920250147963013e-05,
"loss": 1.9167,
"step": 1784
},
{
"epoch": 0.44857699315197586,
"grad_norm": 0.398242712020874,
"learning_rate": 5.916320197242905e-05,
"loss": 2.2221,
"step": 1785
},
{
"epoch": 0.448828296789596,
"grad_norm": 1.2034392356872559,
"learning_rate": 5.912389660677533e-05,
"loss": 2.3411,
"step": 1786
},
{
"epoch": 0.4490796004272162,
"grad_norm": 0.48686063289642334,
"learning_rate": 5.908458540779868e-05,
"loss": 1.9525,
"step": 1787
},
{
"epoch": 0.44933090406483633,
"grad_norm": 0.22618024051189423,
"learning_rate": 5.9045268400632524e-05,
"loss": 1.9921,
"step": 1788
},
{
"epoch": 0.4495822077024565,
"grad_norm": 0.45006734132766724,
"learning_rate": 5.9005945610414036e-05,
"loss": 2.3134,
"step": 1789
},
{
"epoch": 0.44983351134007665,
"grad_norm": 0.22294217348098755,
"learning_rate": 5.8966617062284066e-05,
"loss": 2.1551,
"step": 1790
},
{
"epoch": 0.4500848149776968,
"grad_norm": 0.29651448130607605,
"learning_rate": 5.8927282781387147e-05,
"loss": 2.3817,
"step": 1791
},
{
"epoch": 0.45033611861531697,
"grad_norm": 0.23336410522460938,
"learning_rate": 5.888794279287146e-05,
"loss": 2.0842,
"step": 1792
},
{
"epoch": 0.45058742225293713,
"grad_norm": 0.5561927556991577,
"learning_rate": 5.884859712188887e-05,
"loss": 2.1477,
"step": 1793
},
{
"epoch": 0.4508387258905573,
"grad_norm": 0.537323534488678,
"learning_rate": 5.880924579359486e-05,
"loss": 2.0926,
"step": 1794
},
{
"epoch": 0.4510900295281774,
"grad_norm": 0.47504621744155884,
"learning_rate": 5.8769888833148514e-05,
"loss": 2.1737,
"step": 1795
},
{
"epoch": 0.45134133316579755,
"grad_norm": 0.3701205551624298,
"learning_rate": 5.8730526265712535e-05,
"loss": 2.5034,
"step": 1796
},
{
"epoch": 0.4515926368034177,
"grad_norm": 0.2942044138908386,
"learning_rate": 5.869115811645322e-05,
"loss": 2.1007,
"step": 1797
},
{
"epoch": 0.4518439404410379,
"grad_norm": 0.3349366784095764,
"learning_rate": 5.86517844105404e-05,
"loss": 1.9833,
"step": 1798
},
{
"epoch": 0.45209524407865803,
"grad_norm": 0.41249701380729675,
"learning_rate": 5.861240517314749e-05,
"loss": 2.287,
"step": 1799
},
{
"epoch": 0.4523465477162782,
"grad_norm": 0.2055690586566925,
"learning_rate": 5.8573020429451455e-05,
"loss": 2.4701,
"step": 1800
},
{
"epoch": 0.45259785135389835,
"grad_norm": 0.400979220867157,
"learning_rate": 5.8533630204632704e-05,
"loss": 2.6295,
"step": 1801
},
{
"epoch": 0.4528491549915185,
"grad_norm": 0.45950108766555786,
"learning_rate": 5.8494234523875234e-05,
"loss": 2.2317,
"step": 1802
},
{
"epoch": 0.45310045862913867,
"grad_norm": 0.3457741439342499,
"learning_rate": 5.84548334123665e-05,
"loss": 2.4107,
"step": 1803
},
{
"epoch": 0.45335176226675883,
"grad_norm": 0.5182722806930542,
"learning_rate": 5.8415426895297434e-05,
"loss": 2.4685,
"step": 1804
},
{
"epoch": 0.453603065904379,
"grad_norm": 0.4126606583595276,
"learning_rate": 5.837601499786239e-05,
"loss": 2.6861,
"step": 1805
},
{
"epoch": 0.45385436954199915,
"grad_norm": 0.4424208998680115,
"learning_rate": 5.8336597745259226e-05,
"loss": 2.191,
"step": 1806
},
{
"epoch": 0.45410567317961925,
"grad_norm": 0.5643180012702942,
"learning_rate": 5.8297175162689164e-05,
"loss": 2.5079,
"step": 1807
},
{
"epoch": 0.4543569768172394,
"grad_norm": 0.351386159658432,
"learning_rate": 5.825774727535688e-05,
"loss": 2.2525,
"step": 1808
},
{
"epoch": 0.45460828045485957,
"grad_norm": 0.5042213201522827,
"learning_rate": 5.8218314108470385e-05,
"loss": 2.1878,
"step": 1809
},
{
"epoch": 0.45485958409247973,
"grad_norm": 0.4025678038597107,
"learning_rate": 5.817887568724113e-05,
"loss": 2.2556,
"step": 1810
},
{
"epoch": 0.4551108877300999,
"grad_norm": 0.5732985138893127,
"learning_rate": 5.8139432036883875e-05,
"loss": 1.8569,
"step": 1811
},
{
"epoch": 0.45536219136772005,
"grad_norm": 0.35006287693977356,
"learning_rate": 5.809998318261677e-05,
"loss": 1.04,
"step": 1812
},
{
"epoch": 0.4556134950053402,
"grad_norm": 0.4501281678676605,
"learning_rate": 5.806052914966124e-05,
"loss": 2.293,
"step": 1813
},
{
"epoch": 0.45586479864296037,
"grad_norm": 0.3089803457260132,
"learning_rate": 5.802106996324206e-05,
"loss": 2.4766,
"step": 1814
},
{
"epoch": 0.4561161022805805,
"grad_norm": 0.38950660824775696,
"learning_rate": 5.7981605648587264e-05,
"loss": 1.977,
"step": 1815
},
{
"epoch": 0.4563674059182007,
"grad_norm": 0.3673931062221527,
"learning_rate": 5.7942136230928226e-05,
"loss": 2.0008,
"step": 1816
},
{
"epoch": 0.45661870955582085,
"grad_norm": 0.378292441368103,
"learning_rate": 5.790266173549951e-05,
"loss": 2.3145,
"step": 1817
},
{
"epoch": 0.45687001319344095,
"grad_norm": 0.1496533304452896,
"learning_rate": 5.786318218753898e-05,
"loss": 1.2677,
"step": 1818
},
{
"epoch": 0.4571213168310611,
"grad_norm": 0.1584922969341278,
"learning_rate": 5.78236976122877e-05,
"loss": 0.6397,
"step": 1819
},
{
"epoch": 0.45737262046868127,
"grad_norm": 0.37663987278938293,
"learning_rate": 5.778420803498995e-05,
"loss": 2.0862,
"step": 1820
},
{
"epoch": 0.4576239241063014,
"grad_norm": 0.34862029552459717,
"learning_rate": 5.774471348089323e-05,
"loss": 1.7093,
"step": 1821
},
{
"epoch": 0.4578752277439216,
"grad_norm": 0.3889473080635071,
"learning_rate": 5.7705213975248216e-05,
"loss": 2.127,
"step": 1822
},
{
"epoch": 0.45812653138154175,
"grad_norm": 0.5065658092498779,
"learning_rate": 5.766570954330872e-05,
"loss": 2.1475,
"step": 1823
},
{
"epoch": 0.4583778350191619,
"grad_norm": 0.3527318239212036,
"learning_rate": 5.7626200210331746e-05,
"loss": 2.1492,
"step": 1824
},
{
"epoch": 0.45862913865678206,
"grad_norm": 0.39478063583374023,
"learning_rate": 5.75866860015774e-05,
"loss": 2.1778,
"step": 1825
},
{
"epoch": 0.4588804422944022,
"grad_norm": 0.5078315734863281,
"learning_rate": 5.7547166942308925e-05,
"loss": 2.2873,
"step": 1826
},
{
"epoch": 0.4591317459320224,
"grad_norm": 0.46022409200668335,
"learning_rate": 5.750764305779265e-05,
"loss": 2.1011,
"step": 1827
},
{
"epoch": 0.45938304956964254,
"grad_norm": 0.4178464114665985,
"learning_rate": 5.7468114373297995e-05,
"loss": 2.2731,
"step": 1828
},
{
"epoch": 0.4596343532072627,
"grad_norm": 0.4607785940170288,
"learning_rate": 5.7428580914097465e-05,
"loss": 2.2106,
"step": 1829
},
{
"epoch": 0.4598856568448828,
"grad_norm": 0.518441915512085,
"learning_rate": 5.738904270546658e-05,
"loss": 2.5474,
"step": 1830
},
{
"epoch": 0.46013696048250297,
"grad_norm": 0.39153853058815,
"learning_rate": 5.7349499772683927e-05,
"loss": 2.2149,
"step": 1831
},
{
"epoch": 0.4603882641201231,
"grad_norm": 0.3753896653652191,
"learning_rate": 5.7309952141031095e-05,
"loss": 2.1979,
"step": 1832
},
{
"epoch": 0.4606395677577433,
"grad_norm": 0.4839867949485779,
"learning_rate": 5.727039983579271e-05,
"loss": 2.1297,
"step": 1833
},
{
"epoch": 0.46089087139536344,
"grad_norm": 0.4048977196216583,
"learning_rate": 5.7230842882256354e-05,
"loss": 2.5347,
"step": 1834
},
{
"epoch": 0.4611421750329836,
"grad_norm": 0.3666813373565674,
"learning_rate": 5.719128130571257e-05,
"loss": 2.4136,
"step": 1835
},
{
"epoch": 0.46139347867060376,
"grad_norm": 0.3380883038043976,
"learning_rate": 5.715171513145492e-05,
"loss": 1.8449,
"step": 1836
},
{
"epoch": 0.4616447823082239,
"grad_norm": 0.28998619318008423,
"learning_rate": 5.711214438477982e-05,
"loss": 2.1513,
"step": 1837
},
{
"epoch": 0.4618960859458441,
"grad_norm": 0.45006147027015686,
"learning_rate": 5.7072569090986675e-05,
"loss": 1.8252,
"step": 1838
},
{
"epoch": 0.46214738958346424,
"grad_norm": 0.39759132266044617,
"learning_rate": 5.703298927537777e-05,
"loss": 1.5816,
"step": 1839
},
{
"epoch": 0.4623986932210844,
"grad_norm": 0.3186738193035126,
"learning_rate": 5.699340496325828e-05,
"loss": 2.3996,
"step": 1840
},
{
"epoch": 0.4626499968587045,
"grad_norm": 0.426439106464386,
"learning_rate": 5.695381617993626e-05,
"loss": 2.0965,
"step": 1841
},
{
"epoch": 0.46290130049632466,
"grad_norm": 0.291789710521698,
"learning_rate": 5.6914222950722626e-05,
"loss": 2.4023,
"step": 1842
},
{
"epoch": 0.4631526041339448,
"grad_norm": 0.4969780743122101,
"learning_rate": 5.687462530093115e-05,
"loss": 1.6636,
"step": 1843
},
{
"epoch": 0.463403907771565,
"grad_norm": 0.3465602993965149,
"learning_rate": 5.6835023255878384e-05,
"loss": 2.7812,
"step": 1844
},
{
"epoch": 0.46365521140918514,
"grad_norm": 0.30250221490859985,
"learning_rate": 5.6795416840883744e-05,
"loss": 2.0632,
"step": 1845
},
{
"epoch": 0.4639065150468053,
"grad_norm": 0.1709776073694229,
"learning_rate": 5.675580608126939e-05,
"loss": 1.5832,
"step": 1846
},
{
"epoch": 0.46415781868442546,
"grad_norm": 0.47186335921287537,
"learning_rate": 5.671619100236032e-05,
"loss": 2.7179,
"step": 1847
},
{
"epoch": 0.4644091223220456,
"grad_norm": 0.3553767204284668,
"learning_rate": 5.6676571629484244e-05,
"loss": 2.5521,
"step": 1848
},
{
"epoch": 0.4646604259596658,
"grad_norm": 0.41556620597839355,
"learning_rate": 5.663694798797162e-05,
"loss": 1.9523,
"step": 1849
},
{
"epoch": 0.46491172959728594,
"grad_norm": 0.4675963819026947,
"learning_rate": 5.659732010315567e-05,
"loss": 2.045,
"step": 1850
},
{
"epoch": 0.4651630332349061,
"grad_norm": 0.41482940316200256,
"learning_rate": 5.655768800037229e-05,
"loss": 2.0328,
"step": 1851
},
{
"epoch": 0.46541433687252626,
"grad_norm": 0.4568209648132324,
"learning_rate": 5.6518051704960094e-05,
"loss": 1.8168,
"step": 1852
},
{
"epoch": 0.46566564051014636,
"grad_norm": 0.5529626607894897,
"learning_rate": 5.6478411242260376e-05,
"loss": 1.9009,
"step": 1853
},
{
"epoch": 0.4659169441477665,
"grad_norm": 0.5022344589233398,
"learning_rate": 5.643876663761709e-05,
"loss": 2.038,
"step": 1854
},
{
"epoch": 0.4661682477853867,
"grad_norm": 0.32788538932800293,
"learning_rate": 5.6399117916376844e-05,
"loss": 2.186,
"step": 1855
},
{
"epoch": 0.46641955142300684,
"grad_norm": 0.38160184025764465,
"learning_rate": 5.6359465103888854e-05,
"loss": 1.8572,
"step": 1856
},
{
"epoch": 0.466670855060627,
"grad_norm": 0.17872734367847443,
"learning_rate": 5.631980822550499e-05,
"loss": 0.8302,
"step": 1857
},
{
"epoch": 0.46692215869824716,
"grad_norm": 0.4570051431655884,
"learning_rate": 5.628014730657969e-05,
"loss": 1.8124,
"step": 1858
},
{
"epoch": 0.4671734623358673,
"grad_norm": 0.36893993616104126,
"learning_rate": 5.6240482372469995e-05,
"loss": 2.4174,
"step": 1859
},
{
"epoch": 0.4674247659734875,
"grad_norm": 0.4820000231266022,
"learning_rate": 5.62008134485355e-05,
"loss": 1.9411,
"step": 1860
},
{
"epoch": 0.46767606961110764,
"grad_norm": 0.2347617745399475,
"learning_rate": 5.616114056013835e-05,
"loss": 1.9671,
"step": 1861
},
{
"epoch": 0.4679273732487278,
"grad_norm": 0.381150484085083,
"learning_rate": 5.6121463732643255e-05,
"loss": 2.0366,
"step": 1862
},
{
"epoch": 0.46817867688634796,
"grad_norm": 0.6411362290382385,
"learning_rate": 5.6081782991417384e-05,
"loss": 2.3673,
"step": 1863
},
{
"epoch": 0.46842998052396806,
"grad_norm": 0.36828893423080444,
"learning_rate": 5.6042098361830464e-05,
"loss": 2.8204,
"step": 1864
},
{
"epoch": 0.4686812841615882,
"grad_norm": 0.31962066888809204,
"learning_rate": 5.600240986925469e-05,
"loss": 2.2475,
"step": 1865
},
{
"epoch": 0.4689325877992084,
"grad_norm": 0.3448081910610199,
"learning_rate": 5.5962717539064716e-05,
"loss": 2.1665,
"step": 1866
},
{
"epoch": 0.46918389143682854,
"grad_norm": 0.5330995917320251,
"learning_rate": 5.5923021396637656e-05,
"loss": 2.2013,
"step": 1867
},
{
"epoch": 0.4694351950744487,
"grad_norm": 0.4247521460056305,
"learning_rate": 5.588332146735307e-05,
"loss": 2.2438,
"step": 1868
},
{
"epoch": 0.46968649871206886,
"grad_norm": 0.4937816262245178,
"learning_rate": 5.584361777659293e-05,
"loss": 2.1838,
"step": 1869
},
{
"epoch": 0.469937802349689,
"grad_norm": 0.3027281165122986,
"learning_rate": 5.5803910349741615e-05,
"loss": 2.1061,
"step": 1870
},
{
"epoch": 0.4701891059873092,
"grad_norm": 0.40652328729629517,
"learning_rate": 5.5764199212185895e-05,
"loss": 1.9048,
"step": 1871
},
{
"epoch": 0.47044040962492933,
"grad_norm": 0.4994272291660309,
"learning_rate": 5.5724484389314904e-05,
"loss": 2.3435,
"step": 1872
},
{
"epoch": 0.4706917132625495,
"grad_norm": 0.38275086879730225,
"learning_rate": 5.568476590652014e-05,
"loss": 2.264,
"step": 1873
},
{
"epoch": 0.47094301690016965,
"grad_norm": 0.33458179235458374,
"learning_rate": 5.564504378919544e-05,
"loss": 2.2379,
"step": 1874
},
{
"epoch": 0.47119432053778976,
"grad_norm": 0.4976329207420349,
"learning_rate": 5.560531806273697e-05,
"loss": 2.2247,
"step": 1875
},
{
"epoch": 0.4714456241754099,
"grad_norm": 0.34130215644836426,
"learning_rate": 5.55655887525432e-05,
"loss": 2.3974,
"step": 1876
},
{
"epoch": 0.4716969278130301,
"grad_norm": 0.4960635304450989,
"learning_rate": 5.5525855884014867e-05,
"loss": 2.3234,
"step": 1877
},
{
"epoch": 0.47194823145065024,
"grad_norm": 0.31311216950416565,
"learning_rate": 5.548611948255502e-05,
"loss": 2.1189,
"step": 1878
},
{
"epoch": 0.4721995350882704,
"grad_norm": 0.3007585406303406,
"learning_rate": 5.5446379573568964e-05,
"loss": 2.2485,
"step": 1879
},
{
"epoch": 0.47245083872589055,
"grad_norm": 0.4978335499763489,
"learning_rate": 5.54066361824642e-05,
"loss": 2.2486,
"step": 1880
},
{
"epoch": 0.4727021423635107,
"grad_norm": 0.3356104791164398,
"learning_rate": 5.536688933465053e-05,
"loss": 2.4258,
"step": 1881
},
{
"epoch": 0.4729534460011309,
"grad_norm": 0.3924019932746887,
"learning_rate": 5.532713905553989e-05,
"loss": 2.2866,
"step": 1882
},
{
"epoch": 0.47320474963875103,
"grad_norm": 0.5366889238357544,
"learning_rate": 5.5287385370546476e-05,
"loss": 2.3925,
"step": 1883
},
{
"epoch": 0.4734560532763712,
"grad_norm": 0.37187063694000244,
"learning_rate": 5.5247628305086596e-05,
"loss": 2.3975,
"step": 1884
},
{
"epoch": 0.47370735691399135,
"grad_norm": 0.5848695635795593,
"learning_rate": 5.520786788457879e-05,
"loss": 2.2758,
"step": 1885
},
{
"epoch": 0.4739586605516115,
"grad_norm": 0.359260231256485,
"learning_rate": 5.5168104134443674e-05,
"loss": 2.2616,
"step": 1886
},
{
"epoch": 0.4742099641892316,
"grad_norm": 0.25694742798805237,
"learning_rate": 5.512833708010404e-05,
"loss": 2.4823,
"step": 1887
},
{
"epoch": 0.4744612678268518,
"grad_norm": 0.4734945595264435,
"learning_rate": 5.508856674698479e-05,
"loss": 2.8068,
"step": 1888
},
{
"epoch": 0.47471257146447193,
"grad_norm": 0.2672096788883209,
"learning_rate": 5.5048793160512914e-05,
"loss": 2.4088,
"step": 1889
},
{
"epoch": 0.4749638751020921,
"grad_norm": 0.3386220932006836,
"learning_rate": 5.500901634611745e-05,
"loss": 2.0906,
"step": 1890
},
{
"epoch": 0.47521517873971225,
"grad_norm": 0.41815564036369324,
"learning_rate": 5.4969236329229544e-05,
"loss": 1.9785,
"step": 1891
},
{
"epoch": 0.4754664823773324,
"grad_norm": 0.5156980752944946,
"learning_rate": 5.492945313528237e-05,
"loss": 1.9482,
"step": 1892
},
{
"epoch": 0.47571778601495257,
"grad_norm": 0.5490496754646301,
"learning_rate": 5.488966678971115e-05,
"loss": 2.1706,
"step": 1893
},
{
"epoch": 0.47596908965257273,
"grad_norm": 0.47855961322784424,
"learning_rate": 5.484987731795309e-05,
"loss": 2.2599,
"step": 1894
},
{
"epoch": 0.4762203932901929,
"grad_norm": 0.31765252351760864,
"learning_rate": 5.481008474544742e-05,
"loss": 1.9829,
"step": 1895
},
{
"epoch": 0.47647169692781305,
"grad_norm": 0.3568671941757202,
"learning_rate": 5.477028909763535e-05,
"loss": 2.6977,
"step": 1896
},
{
"epoch": 0.4767230005654332,
"grad_norm": 0.6139686703681946,
"learning_rate": 5.473049039996005e-05,
"loss": 1.8471,
"step": 1897
},
{
"epoch": 0.4769743042030533,
"grad_norm": 0.31178557872772217,
"learning_rate": 5.469068867786663e-05,
"loss": 2.2934,
"step": 1898
},
{
"epoch": 0.47722560784067347,
"grad_norm": 0.20065036416053772,
"learning_rate": 5.465088395680216e-05,
"loss": 1.1206,
"step": 1899
},
{
"epoch": 0.47747691147829363,
"grad_norm": 0.5498302578926086,
"learning_rate": 5.461107626221561e-05,
"loss": 1.8837,
"step": 1900
},
{
"epoch": 0.4777282151159138,
"grad_norm": 0.34776148200035095,
"learning_rate": 5.4571265619557846e-05,
"loss": 2.091,
"step": 1901
},
{
"epoch": 0.47797951875353395,
"grad_norm": 0.4100053906440735,
"learning_rate": 5.453145205428163e-05,
"loss": 2.5989,
"step": 1902
},
{
"epoch": 0.4782308223911541,
"grad_norm": 0.47446209192276,
"learning_rate": 5.4491635591841586e-05,
"loss": 2.3832,
"step": 1903
},
{
"epoch": 0.47848212602877427,
"grad_norm": 0.44983819127082825,
"learning_rate": 5.4451816257694185e-05,
"loss": 2.5454,
"step": 1904
},
{
"epoch": 0.4787334296663944,
"grad_norm": 0.3998521566390991,
"learning_rate": 5.441199407729777e-05,
"loss": 2.5369,
"step": 1905
},
{
"epoch": 0.4789847333040146,
"grad_norm": 0.5979151725769043,
"learning_rate": 5.437216907611246e-05,
"loss": 2.1094,
"step": 1906
},
{
"epoch": 0.47923603694163475,
"grad_norm": 0.5320187211036682,
"learning_rate": 5.433234127960018e-05,
"loss": 2.1983,
"step": 1907
},
{
"epoch": 0.4794873405792549,
"grad_norm": 0.3964744210243225,
"learning_rate": 5.4292510713224676e-05,
"loss": 2.4393,
"step": 1908
},
{
"epoch": 0.47973864421687507,
"grad_norm": 0.271665096282959,
"learning_rate": 5.4252677402451435e-05,
"loss": 2.0589,
"step": 1909
},
{
"epoch": 0.47998994785449517,
"grad_norm": 0.40751466155052185,
"learning_rate": 5.42128413727477e-05,
"loss": 1.7317,
"step": 1910
},
{
"epoch": 0.48024125149211533,
"grad_norm": 0.28429216146469116,
"learning_rate": 5.417300264958248e-05,
"loss": 2.6466,
"step": 1911
},
{
"epoch": 0.4804925551297355,
"grad_norm": 0.5931762456893921,
"learning_rate": 5.413316125842647e-05,
"loss": 2.1482,
"step": 1912
},
{
"epoch": 0.48074385876735565,
"grad_norm": 0.3537997305393219,
"learning_rate": 5.409331722475211e-05,
"loss": 2.1424,
"step": 1913
},
{
"epoch": 0.4809951624049758,
"grad_norm": 0.3427460789680481,
"learning_rate": 5.405347057403346e-05,
"loss": 2.3135,
"step": 1914
},
{
"epoch": 0.48124646604259597,
"grad_norm": 0.542516827583313,
"learning_rate": 5.401362133174635e-05,
"loss": 2.2244,
"step": 1915
},
{
"epoch": 0.4814977696802161,
"grad_norm": 0.32623621821403503,
"learning_rate": 5.397376952336819e-05,
"loss": 1.7302,
"step": 1916
},
{
"epoch": 0.4817490733178363,
"grad_norm": 0.334738552570343,
"learning_rate": 5.3933915174378066e-05,
"loss": 2.1778,
"step": 1917
},
{
"epoch": 0.48200037695545644,
"grad_norm": 0.4472411572933197,
"learning_rate": 5.389405831025668e-05,
"loss": 2.1097,
"step": 1918
},
{
"epoch": 0.4822516805930766,
"grad_norm": 0.43921470642089844,
"learning_rate": 5.385419895648633e-05,
"loss": 2.196,
"step": 1919
},
{
"epoch": 0.48250298423069676,
"grad_norm": 0.4858810007572174,
"learning_rate": 5.381433713855094e-05,
"loss": 2.0108,
"step": 1920
},
{
"epoch": 0.48275428786831687,
"grad_norm": 0.5167921781539917,
"learning_rate": 5.3774472881935965e-05,
"loss": 2.342,
"step": 1921
},
{
"epoch": 0.483005591505937,
"grad_norm": 0.27940118312835693,
"learning_rate": 5.3734606212128446e-05,
"loss": 2.4511,
"step": 1922
},
{
"epoch": 0.4832568951435572,
"grad_norm": 0.2919957935810089,
"learning_rate": 5.369473715461698e-05,
"loss": 2.1458,
"step": 1923
},
{
"epoch": 0.48350819878117735,
"grad_norm": 0.43019670248031616,
"learning_rate": 5.365486573489166e-05,
"loss": 2.2083,
"step": 1924
},
{
"epoch": 0.4837595024187975,
"grad_norm": 0.39743998646736145,
"learning_rate": 5.361499197844409e-05,
"loss": 2.1078,
"step": 1925
},
{
"epoch": 0.48401080605641766,
"grad_norm": 0.33505332469940186,
"learning_rate": 5.3575115910767395e-05,
"loss": 2.1952,
"step": 1926
},
{
"epoch": 0.4842621096940378,
"grad_norm": 0.2203834056854248,
"learning_rate": 5.3535237557356146e-05,
"loss": 2.282,
"step": 1927
},
{
"epoch": 0.484513413331658,
"grad_norm": 0.7342590093612671,
"learning_rate": 5.349535694370641e-05,
"loss": 1.9875,
"step": 1928
},
{
"epoch": 0.48476471696927814,
"grad_norm": 0.3641369640827179,
"learning_rate": 5.345547409531568e-05,
"loss": 2.1203,
"step": 1929
},
{
"epoch": 0.4850160206068983,
"grad_norm": 2.032423973083496,
"learning_rate": 5.341558903768287e-05,
"loss": 2.0247,
"step": 1930
},
{
"epoch": 0.48526732424451846,
"grad_norm": 0.6103079319000244,
"learning_rate": 5.3375701796308305e-05,
"loss": 2.4254,
"step": 1931
},
{
"epoch": 0.4855186278821386,
"grad_norm": 0.40158334374427795,
"learning_rate": 5.3335812396693744e-05,
"loss": 2.0793,
"step": 1932
},
{
"epoch": 0.4857699315197587,
"grad_norm": 0.5189076066017151,
"learning_rate": 5.329592086434228e-05,
"loss": 1.7984,
"step": 1933
},
{
"epoch": 0.4860212351573789,
"grad_norm": 0.3564301133155823,
"learning_rate": 5.32560272247584e-05,
"loss": 2.0899,
"step": 1934
},
{
"epoch": 0.48627253879499904,
"grad_norm": 0.3177907168865204,
"learning_rate": 5.3216131503447895e-05,
"loss": 2.6728,
"step": 1935
},
{
"epoch": 0.4865238424326192,
"grad_norm": 0.5341473817825317,
"learning_rate": 5.3176233725917954e-05,
"loss": 2.4917,
"step": 1936
},
{
"epoch": 0.48677514607023936,
"grad_norm": 1.1832932233810425,
"learning_rate": 5.313633391767704e-05,
"loss": 1.5361,
"step": 1937
},
{
"epoch": 0.4870264497078595,
"grad_norm": 0.4320741295814514,
"learning_rate": 5.3096432104234895e-05,
"loss": 2.0366,
"step": 1938
},
{
"epoch": 0.4872777533454797,
"grad_norm": 0.347154825925827,
"learning_rate": 5.305652831110259e-05,
"loss": 1.8777,
"step": 1939
},
{
"epoch": 0.48752905698309984,
"grad_norm": 0.5214780569076538,
"learning_rate": 5.301662256379243e-05,
"loss": 1.953,
"step": 1940
},
{
"epoch": 0.48778036062072,
"grad_norm": 0.3790840804576874,
"learning_rate": 5.2976714887817965e-05,
"loss": 2.1505,
"step": 1941
},
{
"epoch": 0.48803166425834016,
"grad_norm": 0.4315270781517029,
"learning_rate": 5.293680530869403e-05,
"loss": 2.4113,
"step": 1942
},
{
"epoch": 0.4882829678959603,
"grad_norm": 0.48924243450164795,
"learning_rate": 5.289689385193659e-05,
"loss": 2.4325,
"step": 1943
},
{
"epoch": 0.4885342715335804,
"grad_norm": 0.3082104027271271,
"learning_rate": 5.285698054306289e-05,
"loss": 2.4431,
"step": 1944
},
{
"epoch": 0.4887855751712006,
"grad_norm": 0.24867354333400726,
"learning_rate": 5.2817065407591316e-05,
"loss": 1.9332,
"step": 1945
},
{
"epoch": 0.48903687880882074,
"grad_norm": 0.3941580653190613,
"learning_rate": 5.277714847104144e-05,
"loss": 2.3426,
"step": 1946
},
{
"epoch": 0.4892881824464409,
"grad_norm": 0.31942319869995117,
"learning_rate": 5.273722975893397e-05,
"loss": 2.0446,
"step": 1947
},
{
"epoch": 0.48953948608406106,
"grad_norm": 0.4005180299282074,
"learning_rate": 5.269730929679076e-05,
"loss": 2.8318,
"step": 1948
},
{
"epoch": 0.4897907897216812,
"grad_norm": 0.33907705545425415,
"learning_rate": 5.265738711013478e-05,
"loss": 2.6326,
"step": 1949
},
{
"epoch": 0.4900420933593014,
"grad_norm": 0.30913516879081726,
"learning_rate": 5.261746322449012e-05,
"loss": 2.0023,
"step": 1950
},
{
"epoch": 0.49029339699692154,
"grad_norm": 0.5829331874847412,
"learning_rate": 5.257753766538192e-05,
"loss": 2.2518,
"step": 1951
},
{
"epoch": 0.4905447006345417,
"grad_norm": 0.420886367559433,
"learning_rate": 5.25376104583364e-05,
"loss": 1.9362,
"step": 1952
},
{
"epoch": 0.49079600427216186,
"grad_norm": 0.3854474127292633,
"learning_rate": 5.249768162888088e-05,
"loss": 2.3414,
"step": 1953
},
{
"epoch": 0.491047307909782,
"grad_norm": 0.36064305901527405,
"learning_rate": 5.245775120254365e-05,
"loss": 2.0693,
"step": 1954
},
{
"epoch": 0.4912986115474022,
"grad_norm": 0.15027983486652374,
"learning_rate": 5.241781920485407e-05,
"loss": 0.7658,
"step": 1955
},
{
"epoch": 0.4915499151850223,
"grad_norm": 0.22711443901062012,
"learning_rate": 5.2377885661342466e-05,
"loss": 2.4119,
"step": 1956
},
{
"epoch": 0.49180121882264244,
"grad_norm": 0.525962769985199,
"learning_rate": 5.233795059754019e-05,
"loss": 2.5089,
"step": 1957
},
{
"epoch": 0.4920525224602626,
"grad_norm": 0.2939525544643402,
"learning_rate": 5.229801403897954e-05,
"loss": 2.0917,
"step": 1958
},
{
"epoch": 0.49230382609788276,
"grad_norm": 0.2676595151424408,
"learning_rate": 5.2258076011193765e-05,
"loss": 2.0566,
"step": 1959
},
{
"epoch": 0.4925551297355029,
"grad_norm": 0.38721850514411926,
"learning_rate": 5.22181365397171e-05,
"loss": 2.4625,
"step": 1960
},
{
"epoch": 0.4928064333731231,
"grad_norm": 0.4662513732910156,
"learning_rate": 5.217819565008465e-05,
"loss": 2.3879,
"step": 1961
},
{
"epoch": 0.49305773701074324,
"grad_norm": 0.3855404555797577,
"learning_rate": 5.2138253367832446e-05,
"loss": 1.8247,
"step": 1962
},
{
"epoch": 0.4933090406483634,
"grad_norm": 0.38118478655815125,
"learning_rate": 5.2098309718497405e-05,
"loss": 2.5317,
"step": 1963
},
{
"epoch": 0.49356034428598355,
"grad_norm": 0.483946293592453,
"learning_rate": 5.2058364727617325e-05,
"loss": 1.9472,
"step": 1964
},
{
"epoch": 0.4938116479236037,
"grad_norm": 0.32675084471702576,
"learning_rate": 5.2018418420730866e-05,
"loss": 2.0327,
"step": 1965
},
{
"epoch": 0.4940629515612239,
"grad_norm": 0.43735966086387634,
"learning_rate": 5.197847082337752e-05,
"loss": 2.3672,
"step": 1966
},
{
"epoch": 0.494314255198844,
"grad_norm": 0.43739423155784607,
"learning_rate": 5.193852196109761e-05,
"loss": 2.1745,
"step": 1967
},
{
"epoch": 0.49456555883646414,
"grad_norm": 0.24707311391830444,
"learning_rate": 5.189857185943224e-05,
"loss": 2.4274,
"step": 1968
},
{
"epoch": 0.4948168624740843,
"grad_norm": 0.2463517189025879,
"learning_rate": 5.185862054392337e-05,
"loss": 2.0382,
"step": 1969
},
{
"epoch": 0.49506816611170446,
"grad_norm": 0.38449257612228394,
"learning_rate": 5.181866804011367e-05,
"loss": 2.0812,
"step": 1970
},
{
"epoch": 0.4953194697493246,
"grad_norm": 0.3080759644508362,
"learning_rate": 5.17787143735466e-05,
"loss": 1.702,
"step": 1971
},
{
"epoch": 0.4955707733869448,
"grad_norm": 0.5761748552322388,
"learning_rate": 5.173875956976637e-05,
"loss": 2.3456,
"step": 1972
},
{
"epoch": 0.49582207702456493,
"grad_norm": 0.5399059653282166,
"learning_rate": 5.1698803654317904e-05,
"loss": 2.0914,
"step": 1973
},
{
"epoch": 0.4960733806621851,
"grad_norm": 0.317590594291687,
"learning_rate": 5.165884665274683e-05,
"loss": 2.6376,
"step": 1974
},
{
"epoch": 0.49632468429980525,
"grad_norm": 0.2823307514190674,
"learning_rate": 5.161888859059949e-05,
"loss": 1.6927,
"step": 1975
},
{
"epoch": 0.4965759879374254,
"grad_norm": 0.45718204975128174,
"learning_rate": 5.157892949342289e-05,
"loss": 2.3115,
"step": 1976
},
{
"epoch": 0.49682729157504557,
"grad_norm": 0.42325839400291443,
"learning_rate": 5.15389693867647e-05,
"loss": 2.0753,
"step": 1977
},
{
"epoch": 0.49707859521266573,
"grad_norm": 0.34479397535324097,
"learning_rate": 5.149900829617325e-05,
"loss": 2.3803,
"step": 1978
},
{
"epoch": 0.49732989885028583,
"grad_norm": 0.40837740898132324,
"learning_rate": 5.145904624719746e-05,
"loss": 1.6998,
"step": 1979
},
{
"epoch": 0.497581202487906,
"grad_norm": 0.5791382193565369,
"learning_rate": 5.141908326538691e-05,
"loss": 2.0878,
"step": 1980
},
{
"epoch": 0.49783250612552615,
"grad_norm": 0.4563978612422943,
"learning_rate": 5.137911937629176e-05,
"loss": 1.9045,
"step": 1981
},
{
"epoch": 0.4980838097631463,
"grad_norm": 0.6332088708877563,
"learning_rate": 5.1339154605462734e-05,
"loss": 2.5065,
"step": 1982
},
{
"epoch": 0.49833511340076647,
"grad_norm": 0.3342919945716858,
"learning_rate": 5.129918897845114e-05,
"loss": 2.4884,
"step": 1983
},
{
"epoch": 0.49858641703838663,
"grad_norm": 0.24111764132976532,
"learning_rate": 5.125922252080881e-05,
"loss": 2.1311,
"step": 1984
},
{
"epoch": 0.4988377206760068,
"grad_norm": 0.39446836709976196,
"learning_rate": 5.121925525808815e-05,
"loss": 2.2891,
"step": 1985
},
{
"epoch": 0.49908902431362695,
"grad_norm": 0.3553325831890106,
"learning_rate": 5.117928721584202e-05,
"loss": 1.9434,
"step": 1986
},
{
"epoch": 0.4993403279512471,
"grad_norm": 0.20506922900676727,
"learning_rate": 5.1139318419623825e-05,
"loss": 0.9382,
"step": 1987
},
{
"epoch": 0.49959163158886727,
"grad_norm": 0.2444799542427063,
"learning_rate": 5.1099348894987445e-05,
"loss": 2.1564,
"step": 1988
},
{
"epoch": 0.49984293522648743,
"grad_norm": 0.25949880480766296,
"learning_rate": 5.105937866748721e-05,
"loss": 2.593,
"step": 1989
},
{
"epoch": 0.5000942388641075,
"grad_norm": 0.3917735517024994,
"learning_rate": 5.1019407762677906e-05,
"loss": 2.3575,
"step": 1990
},
{
"epoch": 0.5003455425017277,
"grad_norm": 0.49749505519866943,
"learning_rate": 5.097943620611476e-05,
"loss": 1.9251,
"step": 1991
},
{
"epoch": 0.5005968461393479,
"grad_norm": 0.3024258017539978,
"learning_rate": 5.0939464023353414e-05,
"loss": 2.1926,
"step": 1992
},
{
"epoch": 0.500848149776968,
"grad_norm": 0.5735132694244385,
"learning_rate": 5.089949123994989e-05,
"loss": 2.2119,
"step": 1993
},
{
"epoch": 0.5010994534145882,
"grad_norm": 0.3834688365459442,
"learning_rate": 5.0859517881460625e-05,
"loss": 2.7961,
"step": 1994
},
{
"epoch": 0.5013507570522083,
"grad_norm": 0.3446560800075531,
"learning_rate": 5.0819543973442404e-05,
"loss": 2.3877,
"step": 1995
},
{
"epoch": 0.5016020606898285,
"grad_norm": 0.4088892638683319,
"learning_rate": 5.0779569541452374e-05,
"loss": 2.0748,
"step": 1996
},
{
"epoch": 0.5018533643274486,
"grad_norm": 0.4269651770591736,
"learning_rate": 5.0739594611048004e-05,
"loss": 2.0912,
"step": 1997
},
{
"epoch": 0.5021046679650688,
"grad_norm": 0.27764561772346497,
"learning_rate": 5.0699619207787086e-05,
"loss": 2.3711,
"step": 1998
},
{
"epoch": 0.502355971602689,
"grad_norm": 0.3662841320037842,
"learning_rate": 5.0659643357227726e-05,
"loss": 2.3661,
"step": 1999
},
{
"epoch": 0.5026072752403091,
"grad_norm": 0.3839915990829468,
"learning_rate": 5.06196670849283e-05,
"loss": 2.2995,
"step": 2000
}
],
"logging_steps": 1,
"max_steps": 3979,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.970888917349206e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}