dqncode2-preview3 / checkpoint-1869 /trainer_state.json
DQN-Labs's picture
Upload folder using huggingface_hub
62137e3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9955911823647297,
"eval_steps": 500,
"global_step": 1869,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008016032064128256,
"grad_norm": 1.7998353242874146,
"learning_rate": 1.4035087719298246e-06,
"loss": 2.8148,
"step": 5
},
{
"epoch": 0.01603206412825651,
"grad_norm": 1.8694814443588257,
"learning_rate": 3.157894736842105e-06,
"loss": 2.7787,
"step": 10
},
{
"epoch": 0.02404809619238477,
"grad_norm": 2.0035526752471924,
"learning_rate": 4.912280701754386e-06,
"loss": 2.8341,
"step": 15
},
{
"epoch": 0.03206412825651302,
"grad_norm": 2.0097479820251465,
"learning_rate": 6.666666666666667e-06,
"loss": 2.7125,
"step": 20
},
{
"epoch": 0.04008016032064128,
"grad_norm": 1.6507200002670288,
"learning_rate": 8.421052631578948e-06,
"loss": 2.4992,
"step": 25
},
{
"epoch": 0.04809619238476954,
"grad_norm": 2.043926477432251,
"learning_rate": 1.017543859649123e-05,
"loss": 2.551,
"step": 30
},
{
"epoch": 0.056112224448897796,
"grad_norm": 1.697209119796753,
"learning_rate": 1.192982456140351e-05,
"loss": 2.525,
"step": 35
},
{
"epoch": 0.06412825651302605,
"grad_norm": 1.7798157930374146,
"learning_rate": 1.3684210526315791e-05,
"loss": 2.4075,
"step": 40
},
{
"epoch": 0.07214428857715431,
"grad_norm": 1.4598116874694824,
"learning_rate": 1.543859649122807e-05,
"loss": 2.093,
"step": 45
},
{
"epoch": 0.08016032064128256,
"grad_norm": 1.4842052459716797,
"learning_rate": 1.719298245614035e-05,
"loss": 1.9538,
"step": 50
},
{
"epoch": 0.08817635270541083,
"grad_norm": 1.1754084825515747,
"learning_rate": 1.894736842105263e-05,
"loss": 1.7687,
"step": 55
},
{
"epoch": 0.09619238476953908,
"grad_norm": 0.93027663230896,
"learning_rate": 1.999993988083788e-05,
"loss": 1.6448,
"step": 60
},
{
"epoch": 0.10420841683366733,
"grad_norm": 1.0409395694732666,
"learning_rate": 1.999926354856561e-05,
"loss": 1.4744,
"step": 65
},
{
"epoch": 0.11222444889779559,
"grad_norm": 0.9661508202552795,
"learning_rate": 1.999783578606323e-05,
"loss": 1.3396,
"step": 70
},
{
"epoch": 0.12024048096192384,
"grad_norm": 0.6014273166656494,
"learning_rate": 1.999565670062504e-05,
"loss": 1.2886,
"step": 75
},
{
"epoch": 0.1282565130260521,
"grad_norm": 0.5582892894744873,
"learning_rate": 1.9992726456006157e-05,
"loss": 1.1745,
"step": 80
},
{
"epoch": 0.13627254509018036,
"grad_norm": 0.4300733804702759,
"learning_rate": 1.9989045272410242e-05,
"loss": 1.163,
"step": 85
},
{
"epoch": 0.14428857715430862,
"grad_norm": 0.6262882351875305,
"learning_rate": 1.9984613426472934e-05,
"loss": 1.2483,
"step": 90
},
{
"epoch": 0.1523046092184369,
"grad_norm": 0.3891115188598633,
"learning_rate": 1.9979431251241057e-05,
"loss": 1.1076,
"step": 95
},
{
"epoch": 0.16032064128256512,
"grad_norm": 3.9172680377960205,
"learning_rate": 1.997349913614761e-05,
"loss": 1.1621,
"step": 100
},
{
"epoch": 0.1683366733466934,
"grad_norm": 0.3938579857349396,
"learning_rate": 1.9966817526982473e-05,
"loss": 1.1969,
"step": 105
},
{
"epoch": 0.17635270541082165,
"grad_norm": 0.4831806719303131,
"learning_rate": 1.9959386925858944e-05,
"loss": 1.1086,
"step": 110
},
{
"epoch": 0.1843687374749499,
"grad_norm": 0.3328356444835663,
"learning_rate": 1.9951207891175973e-05,
"loss": 1.0541,
"step": 115
},
{
"epoch": 0.19238476953907815,
"grad_norm": 0.5235080122947693,
"learning_rate": 1.9942281037576223e-05,
"loss": 1.0936,
"step": 120
},
{
"epoch": 0.20040080160320642,
"grad_norm": 0.3490229547023773,
"learning_rate": 1.993260703589986e-05,
"loss": 1.0498,
"step": 125
},
{
"epoch": 0.20841683366733466,
"grad_norm": 1.1637717485427856,
"learning_rate": 1.9922186613134152e-05,
"loss": 1.0424,
"step": 130
},
{
"epoch": 0.21643286573146292,
"grad_norm": 0.3831629157066345,
"learning_rate": 1.991102055235884e-05,
"loss": 1.105,
"step": 135
},
{
"epoch": 0.22444889779559118,
"grad_norm": 0.5610520243644714,
"learning_rate": 1.989910969268728e-05,
"loss": 1.1341,
"step": 140
},
{
"epoch": 0.23246492985971945,
"grad_norm": 0.5133630037307739,
"learning_rate": 1.9886454929203394e-05,
"loss": 1.0598,
"step": 145
},
{
"epoch": 0.24048096192384769,
"grad_norm": 0.3511698246002197,
"learning_rate": 1.98730572128944e-05,
"loss": 0.9851,
"step": 150
},
{
"epoch": 0.24849699398797595,
"grad_norm": 0.4384966790676117,
"learning_rate": 1.985891755057935e-05,
"loss": 1.0148,
"step": 155
},
{
"epoch": 0.2565130260521042,
"grad_norm": 0.5588643550872803,
"learning_rate": 1.984403700483347e-05,
"loss": 1.0541,
"step": 160
},
{
"epoch": 0.26452905811623245,
"grad_norm": 0.3638758063316345,
"learning_rate": 1.9828416693908305e-05,
"loss": 0.8998,
"step": 165
},
{
"epoch": 0.2725450901803607,
"grad_norm": 0.3875952959060669,
"learning_rate": 1.9812057791647687e-05,
"loss": 0.9024,
"step": 170
},
{
"epoch": 0.280561122244489,
"grad_norm": 0.3295404016971588,
"learning_rate": 1.9794961527399518e-05,
"loss": 0.9846,
"step": 175
},
{
"epoch": 0.28857715430861725,
"grad_norm": 0.3046537935733795,
"learning_rate": 1.9777129185923397e-05,
"loss": 0.9746,
"step": 180
},
{
"epoch": 0.2965931863727455,
"grad_norm": 0.41136643290519714,
"learning_rate": 1.975856210729405e-05,
"loss": 1.0259,
"step": 185
},
{
"epoch": 0.3046092184368738,
"grad_norm": 0.4319979250431061,
"learning_rate": 1.9739261686800662e-05,
"loss": 1.0129,
"step": 190
},
{
"epoch": 0.312625250501002,
"grad_norm": 0.45283690094947815,
"learning_rate": 1.9719229374841978e-05,
"loss": 0.9649,
"step": 195
},
{
"epoch": 0.32064128256513025,
"grad_norm": 0.2747628092765808,
"learning_rate": 1.9698466676817348e-05,
"loss": 0.9863,
"step": 200
},
{
"epoch": 0.3286573146292585,
"grad_norm": 1.5212770700454712,
"learning_rate": 1.9676975153013574e-05,
"loss": 0.9803,
"step": 205
},
{
"epoch": 0.3366733466933868,
"grad_norm": 0.48161494731903076,
"learning_rate": 1.965475641848767e-05,
"loss": 1.031,
"step": 210
},
{
"epoch": 0.34468937875751504,
"grad_norm": 0.31916359066963196,
"learning_rate": 1.9631812142945473e-05,
"loss": 1.0912,
"step": 215
},
{
"epoch": 0.3527054108216433,
"grad_norm": 0.418302059173584,
"learning_rate": 1.9608144050616192e-05,
"loss": 0.9857,
"step": 220
},
{
"epoch": 0.36072144288577157,
"grad_norm": 0.38761621713638306,
"learning_rate": 1.9583753920122822e-05,
"loss": 1.0441,
"step": 225
},
{
"epoch": 0.3687374749498998,
"grad_norm": 0.33479398488998413,
"learning_rate": 1.9558643584348478e-05,
"loss": 0.9453,
"step": 230
},
{
"epoch": 0.37675350701402804,
"grad_norm": 0.36575448513031006,
"learning_rate": 1.9532814930298673e-05,
"loss": 0.9196,
"step": 235
},
{
"epoch": 0.3847695390781563,
"grad_norm": 0.33325284719467163,
"learning_rate": 1.95062698989595e-05,
"loss": 1.0024,
"step": 240
},
{
"epoch": 0.3927855711422846,
"grad_norm": 0.3346043825149536,
"learning_rate": 1.947901048515176e-05,
"loss": 0.9017,
"step": 245
},
{
"epoch": 0.40080160320641284,
"grad_norm": 0.4760594666004181,
"learning_rate": 1.9451038737381078e-05,
"loss": 0.941,
"step": 250
},
{
"epoch": 0.4088176352705411,
"grad_norm": 0.3105100989341736,
"learning_rate": 1.9422356757683946e-05,
"loss": 0.9357,
"step": 255
},
{
"epoch": 0.4168336673346693,
"grad_norm": 0.35822534561157227,
"learning_rate": 1.939296670146976e-05,
"loss": 1.002,
"step": 260
},
{
"epoch": 0.4248496993987976,
"grad_norm": 0.4366324245929718,
"learning_rate": 1.936287077735884e-05,
"loss": 0.9836,
"step": 265
},
{
"epoch": 0.43286573146292584,
"grad_norm": 0.339522123336792,
"learning_rate": 1.9332071247016476e-05,
"loss": 0.8653,
"step": 270
},
{
"epoch": 0.4408817635270541,
"grad_norm": 0.31696006655693054,
"learning_rate": 1.930057042498293e-05,
"loss": 0.9843,
"step": 275
},
{
"epoch": 0.44889779559118237,
"grad_norm": 0.30729272961616516,
"learning_rate": 1.926837067849953e-05,
"loss": 1.0606,
"step": 280
},
{
"epoch": 0.45691382765531063,
"grad_norm": 0.4165307879447937,
"learning_rate": 1.9235474427330783e-05,
"loss": 0.9156,
"step": 285
},
{
"epoch": 0.4649298597194389,
"grad_norm": 0.43856576085090637,
"learning_rate": 1.9201884143582496e-05,
"loss": 0.9984,
"step": 290
},
{
"epoch": 0.4729458917835671,
"grad_norm": 0.333844393491745,
"learning_rate": 1.916760235151604e-05,
"loss": 0.9388,
"step": 295
},
{
"epoch": 0.48096192384769537,
"grad_norm": 0.34350189566612244,
"learning_rate": 1.913263162735862e-05,
"loss": 0.9525,
"step": 300
},
{
"epoch": 0.48897795591182364,
"grad_norm": 0.37638700008392334,
"learning_rate": 1.909697459910972e-05,
"loss": 0.9491,
"step": 305
},
{
"epoch": 0.4969939879759519,
"grad_norm": 0.37804660201072693,
"learning_rate": 1.906063394634356e-05,
"loss": 0.9613,
"step": 310
},
{
"epoch": 0.5050100200400801,
"grad_norm": 0.39323610067367554,
"learning_rate": 1.902361240000778e-05,
"loss": 0.9028,
"step": 315
},
{
"epoch": 0.5130260521042084,
"grad_norm": 0.3674936294555664,
"learning_rate": 1.8985912742218167e-05,
"loss": 1.0493,
"step": 320
},
{
"epoch": 0.5210420841683366,
"grad_norm": 0.432797908782959,
"learning_rate": 1.894753780604962e-05,
"loss": 1.0113,
"step": 325
},
{
"epoch": 0.5290581162324649,
"grad_norm": 0.33035048842430115,
"learning_rate": 1.8908490475323234e-05,
"loss": 0.9485,
"step": 330
},
{
"epoch": 0.5370741482965932,
"grad_norm": 0.4390115439891815,
"learning_rate": 1.886877368438957e-05,
"loss": 0.8867,
"step": 335
},
{
"epoch": 0.5450901803607214,
"grad_norm": 0.3174438178539276,
"learning_rate": 1.882839041790818e-05,
"loss": 0.8476,
"step": 340
},
{
"epoch": 0.5531062124248497,
"grad_norm": 0.35620829463005066,
"learning_rate": 1.8787343710623278e-05,
"loss": 0.9164,
"step": 345
},
{
"epoch": 0.561122244488978,
"grad_norm": 0.36334139108657837,
"learning_rate": 1.8745636647135693e-05,
"loss": 1.0097,
"step": 350
},
{
"epoch": 0.5691382765531062,
"grad_norm": 0.2904795706272125,
"learning_rate": 1.870327236167109e-05,
"loss": 0.849,
"step": 355
},
{
"epoch": 0.5771543086172345,
"grad_norm": 0.287715882062912,
"learning_rate": 1.866025403784439e-05,
"loss": 0.9578,
"step": 360
},
{
"epoch": 0.5851703406813628,
"grad_norm": 0.3586503565311432,
"learning_rate": 1.8616584908420573e-05,
"loss": 0.9323,
"step": 365
},
{
"epoch": 0.593186372745491,
"grad_norm": 0.8753472566604614,
"learning_rate": 1.8572268255071718e-05,
"loss": 0.903,
"step": 370
},
{
"epoch": 0.6012024048096193,
"grad_norm": 0.5796067118644714,
"learning_rate": 1.8527307408130388e-05,
"loss": 0.9665,
"step": 375
},
{
"epoch": 0.6092184368737475,
"grad_norm": 0.3357750177383423,
"learning_rate": 1.848170574633937e-05,
"loss": 0.8183,
"step": 380
},
{
"epoch": 0.6172344689378757,
"grad_norm": 0.3579539954662323,
"learning_rate": 1.8435466696597758e-05,
"loss": 0.9593,
"step": 385
},
{
"epoch": 0.625250501002004,
"grad_norm": 0.30244457721710205,
"learning_rate": 1.8388593733703428e-05,
"loss": 0.9102,
"step": 390
},
{
"epoch": 0.6332665330661322,
"grad_norm": 0.37754353880882263,
"learning_rate": 1.8341090380091926e-05,
"loss": 0.8921,
"step": 395
},
{
"epoch": 0.6412825651302605,
"grad_norm": 0.316244512796402,
"learning_rate": 1.8292960205571742e-05,
"loss": 0.8915,
"step": 400
},
{
"epoch": 0.6492985971943888,
"grad_norm": 0.3463846445083618,
"learning_rate": 1.824420682705606e-05,
"loss": 0.921,
"step": 405
},
{
"epoch": 0.657314629258517,
"grad_norm": 0.3335030674934387,
"learning_rate": 1.8194833908290933e-05,
"loss": 0.8892,
"step": 410
},
{
"epoch": 0.6653306613226453,
"grad_norm": 0.34191393852233887,
"learning_rate": 1.814484515957998e-05,
"loss": 0.8977,
"step": 415
},
{
"epoch": 0.6733466933867736,
"grad_norm": 0.3235023319721222,
"learning_rate": 1.809424433750555e-05,
"loss": 0.9092,
"step": 420
},
{
"epoch": 0.6813627254509018,
"grad_norm": 0.36189472675323486,
"learning_rate": 1.804303524464643e-05,
"loss": 0.8349,
"step": 425
},
{
"epoch": 0.6893787575150301,
"grad_norm": 0.3975241482257843,
"learning_rate": 1.799122172929206e-05,
"loss": 0.8988,
"step": 430
},
{
"epoch": 0.6973947895791583,
"grad_norm": 0.3317061960697174,
"learning_rate": 1.793880768515337e-05,
"loss": 1.0006,
"step": 435
},
{
"epoch": 0.7054108216432866,
"grad_norm": 0.380412220954895,
"learning_rate": 1.788579705107017e-05,
"loss": 0.8576,
"step": 440
},
{
"epoch": 0.7134268537074149,
"grad_norm": 0.3645264804363251,
"learning_rate": 1.7832193810715125e-05,
"loss": 1.0026,
"step": 445
},
{
"epoch": 0.7214428857715431,
"grad_norm": 0.3171364665031433,
"learning_rate": 1.7778001992294426e-05,
"loss": 0.8791,
"step": 450
},
{
"epoch": 0.7294589178356713,
"grad_norm": 0.3164263069629669,
"learning_rate": 1.772322566824504e-05,
"loss": 0.8095,
"step": 455
},
{
"epoch": 0.7374749498997996,
"grad_norm": 0.43192753195762634,
"learning_rate": 1.7667868954928695e-05,
"loss": 0.962,
"step": 460
},
{
"epoch": 0.7454909819639278,
"grad_norm": 0.3209589421749115,
"learning_rate": 1.761193601232254e-05,
"loss": 0.858,
"step": 465
},
{
"epoch": 0.7535070140280561,
"grad_norm": 0.3736160397529602,
"learning_rate": 1.7555431043706517e-05,
"loss": 1.0115,
"step": 470
},
{
"epoch": 0.7615230460921844,
"grad_norm": 0.36832720041275024,
"learning_rate": 1.74983582953475e-05,
"loss": 0.8106,
"step": 475
},
{
"epoch": 0.7695390781563126,
"grad_norm": 0.3430776596069336,
"learning_rate": 1.744072205618019e-05,
"loss": 0.9519,
"step": 480
},
{
"epoch": 0.7775551102204409,
"grad_norm": 0.3065486252307892,
"learning_rate": 1.7382526657484815e-05,
"loss": 0.8079,
"step": 485
},
{
"epoch": 0.7855711422845691,
"grad_norm": 0.33461305499076843,
"learning_rate": 1.7323776472561625e-05,
"loss": 0.8935,
"step": 490
},
{
"epoch": 0.7935871743486974,
"grad_norm": 0.4995952546596527,
"learning_rate": 1.7264475916402264e-05,
"loss": 0.885,
"step": 495
},
{
"epoch": 0.8016032064128257,
"grad_norm": 0.29805514216423035,
"learning_rate": 1.7204629445357978e-05,
"loss": 0.8196,
"step": 500
},
{
"epoch": 0.8096192384769539,
"grad_norm": 0.2961992621421814,
"learning_rate": 1.7144241556804724e-05,
"loss": 0.8429,
"step": 505
},
{
"epoch": 0.8176352705410822,
"grad_norm": 0.3210907280445099,
"learning_rate": 1.7083316788805212e-05,
"loss": 1.0272,
"step": 510
},
{
"epoch": 0.8256513026052105,
"grad_norm": 0.34204360842704773,
"learning_rate": 1.7021859719767855e-05,
"loss": 0.9828,
"step": 515
},
{
"epoch": 0.8336673346693386,
"grad_norm": 0.367639422416687,
"learning_rate": 1.6959874968102736e-05,
"loss": 1.0168,
"step": 520
},
{
"epoch": 0.8416833667334669,
"grad_norm": 0.393257200717926,
"learning_rate": 1.689736719187452e-05,
"loss": 0.8997,
"step": 525
},
{
"epoch": 0.8496993987975952,
"grad_norm": 0.35654187202453613,
"learning_rate": 1.683434108845241e-05,
"loss": 0.9833,
"step": 530
},
{
"epoch": 0.8577154308617234,
"grad_norm": 0.3392307460308075,
"learning_rate": 1.677080139415715e-05,
"loss": 0.8618,
"step": 535
},
{
"epoch": 0.8657314629258517,
"grad_norm": 0.4089815020561218,
"learning_rate": 1.6706752883905107e-05,
"loss": 0.8715,
"step": 540
},
{
"epoch": 0.87374749498998,
"grad_norm": 0.340652734041214,
"learning_rate": 1.6642200370849427e-05,
"loss": 0.9329,
"step": 545
},
{
"epoch": 0.8817635270541082,
"grad_norm": 0.3595132529735565,
"learning_rate": 1.657714870601833e-05,
"loss": 0.8755,
"step": 550
},
{
"epoch": 0.8897795591182365,
"grad_norm": 0.3543437421321869,
"learning_rate": 1.6511602777950585e-05,
"loss": 0.9098,
"step": 555
},
{
"epoch": 0.8977955911823647,
"grad_norm": 0.3310873508453369,
"learning_rate": 1.6445567512328122e-05,
"loss": 0.8433,
"step": 560
},
{
"epoch": 0.905811623246493,
"grad_norm": 0.3629891574382782,
"learning_rate": 1.6379047871605897e-05,
"loss": 0.9434,
"step": 565
},
{
"epoch": 0.9138276553106213,
"grad_norm": 0.3259185552597046,
"learning_rate": 1.6312048854638927e-05,
"loss": 0.8551,
"step": 570
},
{
"epoch": 0.9218436873747495,
"grad_norm": 0.4061657786369324,
"learning_rate": 1.6244575496306696e-05,
"loss": 0.8558,
"step": 575
},
{
"epoch": 0.9298597194388778,
"grad_norm": 0.28805726766586304,
"learning_rate": 1.6176632867134738e-05,
"loss": 0.8577,
"step": 580
},
{
"epoch": 0.9378757515030061,
"grad_norm": 1.7116715908050537,
"learning_rate": 1.610822607291361e-05,
"loss": 0.8773,
"step": 585
},
{
"epoch": 0.9458917835671342,
"grad_norm": 0.36534276604652405,
"learning_rate": 1.6039360254315213e-05,
"loss": 0.9772,
"step": 590
},
{
"epoch": 0.9539078156312625,
"grad_norm": 0.33474084734916687,
"learning_rate": 1.597004058650647e-05,
"loss": 0.8995,
"step": 595
},
{
"epoch": 0.9619238476953907,
"grad_norm": 0.3544546067714691,
"learning_rate": 1.5900272278760407e-05,
"loss": 0.8963,
"step": 600
},
{
"epoch": 0.969939879759519,
"grad_norm": 0.8063123822212219,
"learning_rate": 1.5830060574064698e-05,
"loss": 0.9184,
"step": 605
},
{
"epoch": 0.9779559118236473,
"grad_norm": 0.35473623871803284,
"learning_rate": 1.5759410748727663e-05,
"loss": 0.8378,
"step": 610
},
{
"epoch": 0.9859719438877755,
"grad_norm": 0.4922749996185303,
"learning_rate": 1.5688328111981747e-05,
"loss": 0.8228,
"step": 615
},
{
"epoch": 0.9939879759519038,
"grad_norm": 0.3475538492202759,
"learning_rate": 1.5616818005584554e-05,
"loss": 0.8499,
"step": 620
},
{
"epoch": 1.0016032064128257,
"grad_norm": 0.40670621395111084,
"learning_rate": 1.554488580341742e-05,
"loss": 0.9474,
"step": 625
},
{
"epoch": 1.009619238476954,
"grad_norm": 0.35096031427383423,
"learning_rate": 1.547253691108156e-05,
"loss": 0.9408,
"step": 630
},
{
"epoch": 1.0176352705410823,
"grad_norm": 0.477050244808197,
"learning_rate": 1.539977676549186e-05,
"loss": 0.8426,
"step": 635
},
{
"epoch": 1.0256513026052103,
"grad_norm": 0.3859562575817108,
"learning_rate": 1.532661083446829e-05,
"loss": 0.8802,
"step": 640
},
{
"epoch": 1.0336673346693386,
"grad_norm": 0.920563280582428,
"learning_rate": 1.5253044616325015e-05,
"loss": 0.8237,
"step": 645
},
{
"epoch": 1.0416833667334668,
"grad_norm": 0.3554023504257202,
"learning_rate": 1.5179083639457193e-05,
"loss": 0.8263,
"step": 650
},
{
"epoch": 1.049699398797595,
"grad_norm": 0.3896832764148712,
"learning_rate": 1.510473346192554e-05,
"loss": 0.9112,
"step": 655
},
{
"epoch": 1.0577154308617234,
"grad_norm": 0.39052242040634155,
"learning_rate": 1.5029999671038636e-05,
"loss": 0.7501,
"step": 660
},
{
"epoch": 1.0657314629258516,
"grad_norm": 0.9520488977432251,
"learning_rate": 1.4954887882933045e-05,
"loss": 0.8634,
"step": 665
},
{
"epoch": 1.07374749498998,
"grad_norm": 0.4311390817165375,
"learning_rate": 1.4879403742151283e-05,
"loss": 1.0027,
"step": 670
},
{
"epoch": 1.0817635270541082,
"grad_norm": 0.29344749450683594,
"learning_rate": 1.4803552921217636e-05,
"loss": 0.7963,
"step": 675
},
{
"epoch": 1.0897795591182364,
"grad_norm": 0.4940725266933441,
"learning_rate": 1.4727341120211869e-05,
"loss": 0.864,
"step": 680
},
{
"epoch": 1.0977955911823647,
"grad_norm": 0.36703604459762573,
"learning_rate": 1.4650774066340877e-05,
"loss": 0.8763,
"step": 685
},
{
"epoch": 1.105811623246493,
"grad_norm": 0.485721617937088,
"learning_rate": 1.4573857513508297e-05,
"loss": 0.8671,
"step": 690
},
{
"epoch": 1.1138276553106212,
"grad_norm": 0.3963873088359833,
"learning_rate": 1.4496597241882113e-05,
"loss": 0.8517,
"step": 695
},
{
"epoch": 1.1218436873747495,
"grad_norm": 0.329689085483551,
"learning_rate": 1.4418999057460277e-05,
"loss": 0.832,
"step": 700
},
{
"epoch": 1.1298597194388778,
"grad_norm": 0.4781339466571808,
"learning_rate": 1.4341068791634399e-05,
"loss": 0.9939,
"step": 705
},
{
"epoch": 1.137875751503006,
"grad_norm": 0.33180761337280273,
"learning_rate": 1.4262812300751528e-05,
"loss": 0.8397,
"step": 710
},
{
"epoch": 1.1458917835671343,
"grad_norm": 0.3313557505607605,
"learning_rate": 1.4184235465674055e-05,
"loss": 0.8507,
"step": 715
},
{
"epoch": 1.1539078156312625,
"grad_norm": 0.4401721954345703,
"learning_rate": 1.4105344191337783e-05,
"loss": 0.7881,
"step": 720
},
{
"epoch": 1.1619238476953908,
"grad_norm": 0.29335707426071167,
"learning_rate": 1.4026144406308155e-05,
"loss": 0.8678,
"step": 725
},
{
"epoch": 1.169939879759519,
"grad_norm": 0.2999337911605835,
"learning_rate": 1.3946642062334765e-05,
"loss": 0.8667,
"step": 730
},
{
"epoch": 1.1779559118236473,
"grad_norm": 0.2895663380622864,
"learning_rate": 1.3866843133904064e-05,
"loss": 0.8994,
"step": 735
},
{
"epoch": 1.1859719438877756,
"grad_norm": 0.4325989782810211,
"learning_rate": 1.3786753617790405e-05,
"loss": 0.8374,
"step": 740
},
{
"epoch": 1.1939879759519039,
"grad_norm": 0.4179665148258209,
"learning_rate": 1.3706379532605377e-05,
"loss": 0.9817,
"step": 745
},
{
"epoch": 1.2020040080160321,
"grad_norm": 0.3351852297782898,
"learning_rate": 1.362572691834553e-05,
"loss": 0.8651,
"step": 750
},
{
"epoch": 1.2100200400801604,
"grad_norm": 0.3361961245536804,
"learning_rate": 1.3544801835938466e-05,
"loss": 0.8969,
"step": 755
},
{
"epoch": 1.2180360721442887,
"grad_norm": 0.5419386029243469,
"learning_rate": 1.3463610366787392e-05,
"loss": 0.8676,
"step": 760
},
{
"epoch": 1.226052104208417,
"grad_norm": 4.783193111419678,
"learning_rate": 1.3382158612314075e-05,
"loss": 0.8724,
"step": 765
},
{
"epoch": 1.234068136272545,
"grad_norm": 0.3341674208641052,
"learning_rate": 1.3300452693500358e-05,
"loss": 0.8773,
"step": 770
},
{
"epoch": 1.2420841683366732,
"grad_norm": 0.3183631896972656,
"learning_rate": 1.3218498750428164e-05,
"loss": 0.9829,
"step": 775
},
{
"epoch": 1.2501002004008015,
"grad_norm": 0.30661240220069885,
"learning_rate": 1.3136302941818084e-05,
"loss": 0.8996,
"step": 780
},
{
"epoch": 1.2581162324649298,
"grad_norm": 0.31990957260131836,
"learning_rate": 1.3053871444566555e-05,
"loss": 0.8845,
"step": 785
},
{
"epoch": 1.266132264529058,
"grad_norm": 0.34186115860939026,
"learning_rate": 1.2971210453281675e-05,
"loss": 0.9203,
"step": 790
},
{
"epoch": 1.2741482965931863,
"grad_norm": 0.3045668601989746,
"learning_rate": 1.2888326179817686e-05,
"loss": 0.827,
"step": 795
},
{
"epoch": 1.2821643286573146,
"grad_norm": 0.34101662039756775,
"learning_rate": 1.2805224852808165e-05,
"loss": 0.8177,
"step": 800
},
{
"epoch": 1.2901803607214428,
"grad_norm": 0.3752049505710602,
"learning_rate": 1.2721912717197949e-05,
"loss": 0.797,
"step": 805
},
{
"epoch": 1.298196392785571,
"grad_norm": 0.35838568210601807,
"learning_rate": 1.2638396033773836e-05,
"loss": 0.8504,
"step": 810
},
{
"epoch": 1.3062124248496993,
"grad_norm": 0.4186570346355438,
"learning_rate": 1.2554681078694104e-05,
"loss": 0.9181,
"step": 815
},
{
"epoch": 1.3142284569138276,
"grad_norm": 0.3518775701522827,
"learning_rate": 1.2470774143016854e-05,
"loss": 0.8941,
"step": 820
},
{
"epoch": 1.3222444889779559,
"grad_norm": 0.37892553210258484,
"learning_rate": 1.238668153222725e-05,
"loss": 0.8469,
"step": 825
},
{
"epoch": 1.3302605210420841,
"grad_norm": 0.3626463711261749,
"learning_rate": 1.230240956576367e-05,
"loss": 0.9468,
"step": 830
},
{
"epoch": 1.3382765531062124,
"grad_norm": 0.3972296118736267,
"learning_rate": 1.2217964576542829e-05,
"loss": 0.9451,
"step": 835
},
{
"epoch": 1.3462925851703407,
"grad_norm": 0.3794783055782318,
"learning_rate": 1.2133352910483838e-05,
"loss": 0.8176,
"step": 840
},
{
"epoch": 1.354308617234469,
"grad_norm": 0.3861818313598633,
"learning_rate": 1.204858092603133e-05,
"loss": 0.7841,
"step": 845
},
{
"epoch": 1.3623246492985972,
"grad_norm": 0.30657151341438293,
"learning_rate": 1.1963654993677645e-05,
"loss": 0.7061,
"step": 850
},
{
"epoch": 1.3703406813627255,
"grad_norm": 0.338156521320343,
"learning_rate": 1.1878581495484074e-05,
"loss": 0.7851,
"step": 855
},
{
"epoch": 1.3783567134268537,
"grad_norm": 0.36862286925315857,
"learning_rate": 1.179336682460128e-05,
"loss": 1.0114,
"step": 860
},
{
"epoch": 1.386372745490982,
"grad_norm": 0.3428620398044586,
"learning_rate": 1.1708017384788842e-05,
"loss": 0.9138,
"step": 865
},
{
"epoch": 1.3943887775551103,
"grad_norm": 0.3690953254699707,
"learning_rate": 1.1622539589934027e-05,
"loss": 0.9095,
"step": 870
},
{
"epoch": 1.4024048096192385,
"grad_norm": 0.36460062861442566,
"learning_rate": 1.153693986356981e-05,
"loss": 0.8913,
"step": 875
},
{
"epoch": 1.4104208416833668,
"grad_norm": 0.33620065450668335,
"learning_rate": 1.145122463839213e-05,
"loss": 0.882,
"step": 880
},
{
"epoch": 1.418436873747495,
"grad_norm": 0.3633309304714203,
"learning_rate": 1.1365400355776504e-05,
"loss": 0.8295,
"step": 885
},
{
"epoch": 1.4264529058116233,
"grad_norm": 0.3613939881324768,
"learning_rate": 1.1279473465293953e-05,
"loss": 0.8322,
"step": 890
},
{
"epoch": 1.4344689378757516,
"grad_norm": 0.3712775409221649,
"learning_rate": 1.1193450424226333e-05,
"loss": 0.9109,
"step": 895
},
{
"epoch": 1.4424849699398798,
"grad_norm": 0.4387439787387848,
"learning_rate": 1.1107337697081079e-05,
"loss": 0.7593,
"step": 900
},
{
"epoch": 1.450501002004008,
"grad_norm": 0.3387526869773865,
"learning_rate": 1.1021141755105408e-05,
"loss": 0.834,
"step": 905
},
{
"epoch": 1.4585170340681364,
"grad_norm": 0.37399402260780334,
"learning_rate": 1.09348690758e-05,
"loss": 0.9176,
"step": 910
},
{
"epoch": 1.4665330661322646,
"grad_norm": 0.4290678799152374,
"learning_rate": 1.0848526142432252e-05,
"loss": 0.8756,
"step": 915
},
{
"epoch": 1.474549098196393,
"grad_norm": 0.3614942133426666,
"learning_rate": 1.0762119443549035e-05,
"loss": 0.8709,
"step": 920
},
{
"epoch": 1.4825651302605212,
"grad_norm": 0.3980346620082855,
"learning_rate": 1.0675655472489117e-05,
"loss": 0.9928,
"step": 925
},
{
"epoch": 1.4905811623246494,
"grad_norm": 4.1907243728637695,
"learning_rate": 1.0589140726895179e-05,
"loss": 0.7494,
"step": 930
},
{
"epoch": 1.4985971943887775,
"grad_norm": 0.4177466332912445,
"learning_rate": 1.0502581708225555e-05,
"loss": 0.8649,
"step": 935
},
{
"epoch": 1.506613226452906,
"grad_norm": 0.4919191300868988,
"learning_rate": 1.041598492126561e-05,
"loss": 0.8504,
"step": 940
},
{
"epoch": 1.5146292585170342,
"grad_norm": 0.33262577652931213,
"learning_rate": 1.0329356873638958e-05,
"loss": 0.8721,
"step": 945
},
{
"epoch": 1.5226452905811623,
"grad_norm": 0.3962733745574951,
"learning_rate": 1.0242704075318402e-05,
"loss": 0.9185,
"step": 950
},
{
"epoch": 1.5306613226452905,
"grad_norm": 0.4132327735424042,
"learning_rate": 1.0156033038136728e-05,
"loss": 0.8198,
"step": 955
},
{
"epoch": 1.5386773547094188,
"grad_norm": 0.36191555857658386,
"learning_rate": 1.0069350275297338e-05,
"loss": 0.8699,
"step": 960
},
{
"epoch": 1.546693386773547,
"grad_norm": 0.434069961309433,
"learning_rate": 9.982662300884813e-06,
"loss": 0.8678,
"step": 965
},
{
"epoch": 1.5547094188376753,
"grad_norm": 0.4045700430870056,
"learning_rate": 9.89597562937536e-06,
"loss": 0.8307,
"step": 970
},
{
"epoch": 1.5627254509018036,
"grad_norm": 0.31574735045433044,
"learning_rate": 9.809296775147287e-06,
"loss": 0.8058,
"step": 975
},
{
"epoch": 1.5707414829659319,
"grad_norm": 0.38601160049438477,
"learning_rate": 9.722632251991445e-06,
"loss": 0.8026,
"step": 980
},
{
"epoch": 1.5787575150300601,
"grad_norm": 0.3911404013633728,
"learning_rate": 9.635988572621716e-06,
"loss": 0.9165,
"step": 985
},
{
"epoch": 1.5867735470941884,
"grad_norm": 0.4252789318561554,
"learning_rate": 9.54937224818561e-06,
"loss": 0.9078,
"step": 990
},
{
"epoch": 1.5947895791583167,
"grad_norm": 0.32735535502433777,
"learning_rate": 9.462789787774944e-06,
"loss": 0.8508,
"step": 995
},
{
"epoch": 1.602805611222445,
"grad_norm": 0.3567563593387604,
"learning_rate": 9.376247697936719e-06,
"loss": 0.8884,
"step": 1000
},
{
"epoch": 1.6108216432865732,
"grad_norm": 0.3730451166629791,
"learning_rate": 9.289752482184128e-06,
"loss": 0.8712,
"step": 1005
},
{
"epoch": 1.6188376753507014,
"grad_norm": 0.34543997049331665,
"learning_rate": 9.20331064050785e-06,
"loss": 0.8948,
"step": 1010
},
{
"epoch": 1.6268537074148297,
"grad_norm": 0.3326113522052765,
"learning_rate": 9.116928668887587e-06,
"loss": 0.8879,
"step": 1015
},
{
"epoch": 1.6348697394789578,
"grad_norm": 0.33069467544555664,
"learning_rate": 9.03061305880388e-06,
"loss": 0.7994,
"step": 1020
},
{
"epoch": 1.642885771543086,
"grad_norm": 0.34401947259902954,
"learning_rate": 8.94437029675031e-06,
"loss": 0.9361,
"step": 1025
},
{
"epoch": 1.6509018036072143,
"grad_norm": 0.3548894226551056,
"learning_rate": 8.858206863746018e-06,
"loss": 0.8401,
"step": 1030
},
{
"epoch": 1.6589178356713425,
"grad_norm": 0.3403220474720001,
"learning_rate": 8.772129234848692e-06,
"loss": 0.7461,
"step": 1035
},
{
"epoch": 1.6669338677354708,
"grad_norm": 0.4974400997161865,
"learning_rate": 8.686143878667965e-06,
"loss": 0.8533,
"step": 1040
},
{
"epoch": 1.674949899799599,
"grad_norm": 0.4107082188129425,
"learning_rate": 8.600257256879306e-06,
"loss": 0.8885,
"step": 1045
},
{
"epoch": 1.6829659318637273,
"grad_norm": 0.3612527847290039,
"learning_rate": 8.514475823738431e-06,
"loss": 0.835,
"step": 1050
},
{
"epoch": 1.6909819639278556,
"grad_norm": 0.38753876090049744,
"learning_rate": 8.428806025596295e-06,
"loss": 0.9262,
"step": 1055
},
{
"epoch": 1.6989979959919839,
"grad_norm": 1.5978801250457764,
"learning_rate": 8.343254300414629e-06,
"loss": 0.8689,
"step": 1060
},
{
"epoch": 1.7070140280561121,
"grad_norm": 0.44398370385169983,
"learning_rate": 8.257827077282164e-06,
"loss": 0.9272,
"step": 1065
},
{
"epoch": 1.7150300601202404,
"grad_norm": 0.3783067464828491,
"learning_rate": 8.172530775931476e-06,
"loss": 0.8476,
"step": 1070
},
{
"epoch": 1.7230460921843687,
"grad_norm": 0.33193129301071167,
"learning_rate": 8.087371806256548e-06,
"loss": 0.8736,
"step": 1075
},
{
"epoch": 1.731062124248497,
"grad_norm": 0.4492955207824707,
"learning_rate": 8.002356567831104e-06,
"loss": 0.8475,
"step": 1080
},
{
"epoch": 1.7390781563126252,
"grad_norm": 0.3451695740222931,
"learning_rate": 7.917491449427664e-06,
"loss": 0.8555,
"step": 1085
},
{
"epoch": 1.7470941883767535,
"grad_norm": 0.34094515442848206,
"learning_rate": 7.832782828537437e-06,
"loss": 0.7514,
"step": 1090
},
{
"epoch": 1.7551102204408817,
"grad_norm": 0.4276813864707947,
"learning_rate": 7.748237070891085e-06,
"loss": 0.8078,
"step": 1095
},
{
"epoch": 1.76312625250501,
"grad_norm": 0.31638622283935547,
"learning_rate": 7.663860529980318e-06,
"loss": 0.8424,
"step": 1100
},
{
"epoch": 1.7711422845691382,
"grad_norm": 0.3871704339981079,
"learning_rate": 7.5796595465804616e-06,
"loss": 0.7848,
"step": 1105
},
{
"epoch": 1.7791583166332665,
"grad_norm": 0.4998142123222351,
"learning_rate": 7.495640448273947e-06,
"loss": 0.8611,
"step": 1110
},
{
"epoch": 1.7871743486973948,
"grad_norm": 0.35793057084083557,
"learning_rate": 7.411809548974792e-06,
"loss": 0.9746,
"step": 1115
},
{
"epoch": 1.795190380761523,
"grad_norm": 0.4235399067401886,
"learning_rate": 7.328173148454151e-06,
"loss": 0.9494,
"step": 1120
},
{
"epoch": 1.8032064128256513,
"grad_norm": 0.35276517271995544,
"learning_rate": 7.2447375318668545e-06,
"loss": 0.7732,
"step": 1125
},
{
"epoch": 1.8112224448897796,
"grad_norm": 0.2976204752922058,
"learning_rate": 7.1615089692791225e-06,
"loss": 0.8344,
"step": 1130
},
{
"epoch": 1.8192384769539078,
"grad_norm": 0.3903176784515381,
"learning_rate": 7.0784937151973666e-06,
"loss": 0.8048,
"step": 1135
},
{
"epoch": 1.827254509018036,
"grad_norm": 0.37830016016960144,
"learning_rate": 6.99569800809816e-06,
"loss": 0.8156,
"step": 1140
},
{
"epoch": 1.8352705410821644,
"grad_norm": 0.35692963004112244,
"learning_rate": 6.9131280699594545e-06,
"loss": 0.8472,
"step": 1145
},
{
"epoch": 1.8432865731462926,
"grad_norm": 0.35208597779273987,
"learning_rate": 6.8307901057929735e-06,
"loss": 0.792,
"step": 1150
},
{
"epoch": 1.851302605210421,
"grad_norm": 0.38317593932151794,
"learning_rate": 6.748690303177941e-06,
"loss": 0.8337,
"step": 1155
},
{
"epoch": 1.8593186372745492,
"grad_norm": 0.416199654340744,
"learning_rate": 6.66683483179608e-06,
"loss": 0.9125,
"step": 1160
},
{
"epoch": 1.8673346693386774,
"grad_norm": 0.4500935673713684,
"learning_rate": 6.585229842967977e-06,
"loss": 0.8482,
"step": 1165
},
{
"epoch": 1.8753507014028057,
"grad_norm": 0.4686901271343231,
"learning_rate": 6.5038814691908095e-06,
"loss": 0.8502,
"step": 1170
},
{
"epoch": 1.883366733466934,
"grad_norm": 0.41259345412254333,
"learning_rate": 6.422795823677515e-06,
"loss": 0.8229,
"step": 1175
},
{
"epoch": 1.8913827655310622,
"grad_norm": 0.3402479887008667,
"learning_rate": 6.3419789998973655e-06,
"loss": 0.787,
"step": 1180
},
{
"epoch": 1.8993987975951905,
"grad_norm": 0.34355294704437256,
"learning_rate": 6.261437071118086e-06,
"loss": 0.7658,
"step": 1185
},
{
"epoch": 1.9074148296593187,
"grad_norm": 0.3370957374572754,
"learning_rate": 6.1811760899494276e-06,
"loss": 0.8361,
"step": 1190
},
{
"epoch": 1.915430861723447,
"grad_norm": 0.4238271713256836,
"learning_rate": 6.101202087888329e-06,
"loss": 0.8249,
"step": 1195
},
{
"epoch": 1.9234468937875753,
"grad_norm": 0.41702085733413696,
"learning_rate": 6.0215210748656785e-06,
"loss": 0.8571,
"step": 1200
},
{
"epoch": 1.9314629258517035,
"grad_norm": 0.3196205794811249,
"learning_rate": 5.942139038794645e-06,
"loss": 0.803,
"step": 1205
},
{
"epoch": 1.9394789579158318,
"grad_norm": 0.3203709125518799,
"learning_rate": 5.863061945120719e-06,
"loss": 0.8299,
"step": 1210
},
{
"epoch": 1.94749498997996,
"grad_norm": 0.4405556917190552,
"learning_rate": 5.784295736373413e-06,
"loss": 0.7888,
"step": 1215
},
{
"epoch": 1.9555110220440883,
"grad_norm": 0.42714062333106995,
"learning_rate": 5.705846331719676e-06,
"loss": 0.9376,
"step": 1220
},
{
"epoch": 1.9635270541082166,
"grad_norm": 0.4327429234981537,
"learning_rate": 5.627719626519096e-06,
"loss": 0.7978,
"step": 1225
},
{
"epoch": 1.9715430861723446,
"grad_norm": 0.37963250279426575,
"learning_rate": 5.549921491880856e-06,
"loss": 0.8513,
"step": 1230
},
{
"epoch": 1.979559118236473,
"grad_norm": 0.4149099588394165,
"learning_rate": 5.472457774222535e-06,
"loss": 0.8542,
"step": 1235
},
{
"epoch": 1.9875751503006012,
"grad_norm": 0.34531357884407043,
"learning_rate": 5.395334294830766e-06,
"loss": 0.858,
"step": 1240
},
{
"epoch": 1.9955911823647294,
"grad_norm": 0.3555367887020111,
"learning_rate": 5.318556849423757e-06,
"loss": 0.7818,
"step": 1245
},
{
"epoch": 2.0032064128256515,
"grad_norm": 0.47876620292663574,
"learning_rate": 5.242131207715768e-06,
"loss": 0.9044,
"step": 1250
},
{
"epoch": 2.0112224448897797,
"grad_norm": 0.3445394039154053,
"learning_rate": 5.166063112983522e-06,
"loss": 0.8851,
"step": 1255
},
{
"epoch": 2.019238476953908,
"grad_norm": 0.34241387248039246,
"learning_rate": 5.090358281634594e-06,
"loss": 0.7266,
"step": 1260
},
{
"epoch": 2.0272545090180363,
"grad_norm": 0.3980085253715515,
"learning_rate": 5.015022402777838e-06,
"loss": 0.8228,
"step": 1265
},
{
"epoch": 2.0352705410821645,
"grad_norm": 0.3420424163341522,
"learning_rate": 4.940061137795876e-06,
"loss": 0.7685,
"step": 1270
},
{
"epoch": 2.0432865731462924,
"grad_norm": 0.41949328780174255,
"learning_rate": 4.8654801199196176e-06,
"loss": 0.9227,
"step": 1275
},
{
"epoch": 2.0513026052104206,
"grad_norm": 0.4226691722869873,
"learning_rate": 4.791284953804969e-06,
"loss": 0.7992,
"step": 1280
},
{
"epoch": 2.059318637274549,
"grad_norm": 0.39590245485305786,
"learning_rate": 4.717481215111622e-06,
"loss": 0.8595,
"step": 1285
},
{
"epoch": 2.067334669338677,
"grad_norm": 0.4448534846305847,
"learning_rate": 4.644074450084061e-06,
"loss": 0.9019,
"step": 1290
},
{
"epoch": 2.0753507014028054,
"grad_norm": 0.3919767737388611,
"learning_rate": 4.571070175134781e-06,
"loss": 0.8359,
"step": 1295
},
{
"epoch": 2.0833667334669337,
"grad_norm": 0.4835250675678253,
"learning_rate": 4.498473876429727e-06,
"loss": 0.8491,
"step": 1300
},
{
"epoch": 2.091382765531062,
"grad_norm": 0.45917949080467224,
"learning_rate": 4.426291009476007e-06,
"loss": 0.7978,
"step": 1305
},
{
"epoch": 2.09939879759519,
"grad_norm": 0.30657947063446045,
"learning_rate": 4.354526998711945e-06,
"loss": 0.8031,
"step": 1310
},
{
"epoch": 2.1074148296593185,
"grad_norm": 0.4451172649860382,
"learning_rate": 4.283187237099412e-06,
"loss": 0.8898,
"step": 1315
},
{
"epoch": 2.1154308617234467,
"grad_norm": 0.4376426637172699,
"learning_rate": 4.2122770857185805e-06,
"loss": 0.8588,
"step": 1320
},
{
"epoch": 2.123446893787575,
"grad_norm": 0.3958570957183838,
"learning_rate": 4.141801873365023e-06,
"loss": 0.7816,
"step": 1325
},
{
"epoch": 2.1314629258517033,
"grad_norm": 0.42820340394973755,
"learning_rate": 4.0717668961492725e-06,
"loss": 0.845,
"step": 1330
},
{
"epoch": 2.1394789579158315,
"grad_norm": 0.4091578423976898,
"learning_rate": 4.0021774170988395e-06,
"loss": 0.8996,
"step": 1335
},
{
"epoch": 2.14749498997996,
"grad_norm": 0.44767269492149353,
"learning_rate": 3.9330386657626696e-06,
"loss": 0.7196,
"step": 1340
},
{
"epoch": 2.155511022044088,
"grad_norm": 0.47936999797821045,
"learning_rate": 3.864355837818188e-06,
"loss": 0.7673,
"step": 1345
},
{
"epoch": 2.1635270541082163,
"grad_norm": 0.3325825035572052,
"learning_rate": 3.79613409468083e-06,
"loss": 0.7788,
"step": 1350
},
{
"epoch": 2.1715430861723446,
"grad_norm": 0.36496493220329285,
"learning_rate": 3.7283785631161663e-06,
"loss": 0.8764,
"step": 1355
},
{
"epoch": 2.179559118236473,
"grad_norm": 0.37848904728889465,
"learning_rate": 3.6610943348546524e-06,
"loss": 0.8016,
"step": 1360
},
{
"epoch": 2.187575150300601,
"grad_norm": 0.3615955412387848,
"learning_rate": 3.5942864662089684e-06,
"loss": 0.9018,
"step": 1365
},
{
"epoch": 2.1955911823647294,
"grad_norm": 0.45091915130615234,
"learning_rate": 3.527959977694061e-06,
"loss": 0.8454,
"step": 1370
},
{
"epoch": 2.2036072144288577,
"grad_norm": 0.4201883375644684,
"learning_rate": 3.462119853649859e-06,
"loss": 0.9691,
"step": 1375
},
{
"epoch": 2.211623246492986,
"grad_norm": 0.3573856055736542,
"learning_rate": 3.3967710418666986e-06,
"loss": 0.8465,
"step": 1380
},
{
"epoch": 2.219639278557114,
"grad_norm": 0.447793185710907,
"learning_rate": 3.331918453213505e-06,
"loss": 0.9212,
"step": 1385
},
{
"epoch": 2.2276553106212424,
"grad_norm": 0.4908188581466675,
"learning_rate": 3.2675669612687565e-06,
"loss": 0.8528,
"step": 1390
},
{
"epoch": 2.2356713426853707,
"grad_norm": 0.44964873790740967,
"learning_rate": 3.203721401954242e-06,
"loss": 0.9147,
"step": 1395
},
{
"epoch": 2.243687374749499,
"grad_norm": 0.4697072207927704,
"learning_rate": 3.1403865731716266e-06,
"loss": 0.8588,
"step": 1400
},
{
"epoch": 2.2517034068136272,
"grad_norm": 0.36253607273101807,
"learning_rate": 3.0775672344419305e-06,
"loss": 0.8024,
"step": 1405
},
{
"epoch": 2.2597194388777555,
"grad_norm": 0.5492545366287231,
"learning_rate": 3.0152681065478252e-06,
"loss": 0.8588,
"step": 1410
},
{
"epoch": 2.2677354709418838,
"grad_norm": 0.35868972539901733,
"learning_rate": 2.953493871178902e-06,
"loss": 0.8138,
"step": 1415
},
{
"epoch": 2.275751503006012,
"grad_norm": 0.38911786675453186,
"learning_rate": 2.892249170579826e-06,
"loss": 0.7942,
"step": 1420
},
{
"epoch": 2.2837675350701403,
"grad_norm": 0.5678386092185974,
"learning_rate": 2.8315386072014883e-06,
"loss": 0.7764,
"step": 1425
},
{
"epoch": 2.2917835671342686,
"grad_norm": 0.36749324202537537,
"learning_rate": 2.7713667433551495e-06,
"loss": 0.8247,
"step": 1430
},
{
"epoch": 2.299799599198397,
"grad_norm": 0.4348406493663788,
"learning_rate": 2.711738100869563e-06,
"loss": 0.9319,
"step": 1435
},
{
"epoch": 2.307815631262525,
"grad_norm": 0.4162181615829468,
"learning_rate": 2.652657160751193e-06,
"loss": 0.9147,
"step": 1440
},
{
"epoch": 2.3158316633266534,
"grad_norm": 0.36371198296546936,
"learning_rate": 2.59412836284745e-06,
"loss": 0.8078,
"step": 1445
},
{
"epoch": 2.3238476953907816,
"grad_norm": 0.39601725339889526,
"learning_rate": 2.5361561055130625e-06,
"loss": 0.8824,
"step": 1450
},
{
"epoch": 2.33186372745491,
"grad_norm": 0.34811967611312866,
"learning_rate": 2.4787447452795366e-06,
"loss": 0.8163,
"step": 1455
},
{
"epoch": 2.339879759519038,
"grad_norm": 0.405057430267334,
"learning_rate": 2.4218985965277676e-06,
"loss": 0.8474,
"step": 1460
},
{
"epoch": 2.3478957915831664,
"grad_norm": 0.3901906907558441,
"learning_rate": 2.3656219311638194e-06,
"loss": 0.771,
"step": 1465
},
{
"epoch": 2.3559118236472947,
"grad_norm": 0.3617786467075348,
"learning_rate": 2.3099189782979126e-06,
"loss": 0.8022,
"step": 1470
},
{
"epoch": 2.363927855711423,
"grad_norm": 0.3886442184448242,
"learning_rate": 2.2547939239265893e-06,
"loss": 0.8938,
"step": 1475
},
{
"epoch": 2.371943887775551,
"grad_norm": 0.4365661144256592,
"learning_rate": 2.2002509106181625e-06,
"loss": 0.8959,
"step": 1480
},
{
"epoch": 2.3799599198396795,
"grad_norm": 0.3170413374900818,
"learning_rate": 2.146294037201394e-06,
"loss": 0.7777,
"step": 1485
},
{
"epoch": 2.3879759519038077,
"grad_norm": 0.43696296215057373,
"learning_rate": 2.092927358457476e-06,
"loss": 0.8724,
"step": 1490
},
{
"epoch": 2.395991983967936,
"grad_norm": 0.38862401247024536,
"learning_rate": 2.0401548848153296e-06,
"loss": 0.8958,
"step": 1495
},
{
"epoch": 2.4040080160320643,
"grad_norm": 0.4901878535747528,
"learning_rate": 1.9879805820502176e-06,
"loss": 0.903,
"step": 1500
},
{
"epoch": 2.4120240480961925,
"grad_norm": 0.37639907002449036,
"learning_rate": 1.9364083709857184e-06,
"loss": 0.7387,
"step": 1505
},
{
"epoch": 2.420040080160321,
"grad_norm": 0.34562456607818604,
"learning_rate": 1.8854421271990964e-06,
"loss": 0.7325,
"step": 1510
},
{
"epoch": 2.428056112224449,
"grad_norm": 0.44138360023498535,
"learning_rate": 1.835085680730041e-06,
"loss": 0.8788,
"step": 1515
},
{
"epoch": 2.4360721442885773,
"grad_norm": 0.3983897864818573,
"learning_rate": 1.785342815792862e-06,
"loss": 0.8522,
"step": 1520
},
{
"epoch": 2.4440881763527056,
"grad_norm": 0.40477579832077026,
"learning_rate": 1.7362172704920933e-06,
"loss": 0.7911,
"step": 1525
},
{
"epoch": 2.452104208416834,
"grad_norm": 0.4484027624130249,
"learning_rate": 1.6877127365415924e-06,
"loss": 0.8903,
"step": 1530
},
{
"epoch": 2.460120240480962,
"grad_norm": 0.3873910903930664,
"learning_rate": 1.6398328589871126e-06,
"loss": 0.8499,
"step": 1535
},
{
"epoch": 2.46813627254509,
"grad_norm": 0.4115862548351288,
"learning_rate": 1.5925812359323745e-06,
"loss": 0.9026,
"step": 1540
},
{
"epoch": 2.4761523046092186,
"grad_norm": 0.3982996344566345,
"learning_rate": 1.5459614182686866e-06,
"loss": 0.8284,
"step": 1545
},
{
"epoch": 2.4841683366733465,
"grad_norm": 0.3346659541130066,
"learning_rate": 1.4999769094080853e-06,
"loss": 0.8033,
"step": 1550
},
{
"epoch": 2.492184368737475,
"grad_norm": 0.3832646310329437,
"learning_rate": 1.454631165020075e-06,
"loss": 0.8919,
"step": 1555
},
{
"epoch": 2.500200400801603,
"grad_norm": 0.7681372761726379,
"learning_rate": 1.4099275927719235e-06,
"loss": 0.8321,
"step": 1560
},
{
"epoch": 2.5082164328657317,
"grad_norm": 0.415099173784256,
"learning_rate": 1.3658695520725984e-06,
"loss": 0.8178,
"step": 1565
},
{
"epoch": 2.5162324649298595,
"grad_norm": 0.3748820722103119,
"learning_rate": 1.3224603538202929e-06,
"loss": 0.8834,
"step": 1570
},
{
"epoch": 2.5242484969939882,
"grad_norm": 0.37983429431915283,
"learning_rate": 1.2797032601536342e-06,
"loss": 0.8233,
"step": 1575
},
{
"epoch": 2.532264529058116,
"grad_norm": 0.40843164920806885,
"learning_rate": 1.2376014842065264e-06,
"loss": 0.9251,
"step": 1580
},
{
"epoch": 2.5402805611222448,
"grad_norm": 0.34796783328056335,
"learning_rate": 1.1961581898666895e-06,
"loss": 0.8777,
"step": 1585
},
{
"epoch": 2.5482965931863726,
"grad_norm": 0.3763981759548187,
"learning_rate": 1.1553764915379095e-06,
"loss": 0.8709,
"step": 1590
},
{
"epoch": 2.556312625250501,
"grad_norm": 0.7970382571220398,
"learning_rate": 1.115259453905978e-06,
"loss": 0.8451,
"step": 1595
},
{
"epoch": 2.564328657314629,
"grad_norm": 0.3026469051837921,
"learning_rate": 1.075810091708399e-06,
"loss": 0.8302,
"step": 1600
},
{
"epoch": 2.5723446893787574,
"grad_norm": 0.3812445104122162,
"learning_rate": 1.0370313695078316e-06,
"loss": 0.8251,
"step": 1605
},
{
"epoch": 2.5803607214428856,
"grad_norm": 0.36716336011886597,
"learning_rate": 9.989262014693013e-07,
"loss": 0.8475,
"step": 1610
},
{
"epoch": 2.588376753507014,
"grad_norm": 0.4053398072719574,
"learning_rate": 9.614974511412156e-07,
"loss": 0.8174,
"step": 1615
},
{
"epoch": 2.596392785571142,
"grad_norm": 0.4070420563220978,
"learning_rate": 9.247479312401642e-07,
"loss": 0.8769,
"step": 1620
},
{
"epoch": 2.6044088176352704,
"grad_norm": 0.7767839431762695,
"learning_rate": 8.88680403439548e-07,
"loss": 0.8343,
"step": 1625
},
{
"epoch": 2.6124248496993987,
"grad_norm": 0.411374568939209,
"learning_rate": 8.532975781620511e-07,
"loss": 0.8021,
"step": 1630
},
{
"epoch": 2.620440881763527,
"grad_norm": 0.32186996936798096,
"learning_rate": 8.18602114375947e-07,
"loss": 0.8267,
"step": 1635
},
{
"epoch": 2.6284569138276552,
"grad_norm": 0.34994935989379883,
"learning_rate": 7.845966193952825e-07,
"loss": 0.8122,
"step": 1640
},
{
"epoch": 2.6364729458917835,
"grad_norm": 0.399689644575119,
"learning_rate": 7.512836486839492e-07,
"loss": 0.7816,
"step": 1645
},
{
"epoch": 2.6444889779559118,
"grad_norm": 0.3521658778190613,
"learning_rate": 7.18665705663637e-07,
"loss": 0.8229,
"step": 1650
},
{
"epoch": 2.65250501002004,
"grad_norm": 0.46236297488212585,
"learning_rate": 6.867452415257081e-07,
"loss": 0.7963,
"step": 1655
},
{
"epoch": 2.6605210420841683,
"grad_norm": 0.47568464279174805,
"learning_rate": 6.555246550469907e-07,
"loss": 0.7728,
"step": 1660
},
{
"epoch": 2.6685370741482966,
"grad_norm": 0.3366374373435974,
"learning_rate": 6.250062924095158e-07,
"loss": 0.8544,
"step": 1665
},
{
"epoch": 2.676553106212425,
"grad_norm": 0.3431518077850342,
"learning_rate": 5.951924470242121e-07,
"loss": 0.6777,
"step": 1670
},
{
"epoch": 2.684569138276553,
"grad_norm": 0.3027094602584839,
"learning_rate": 5.660853593585458e-07,
"loss": 0.7997,
"step": 1675
},
{
"epoch": 2.6925851703406813,
"grad_norm": 0.38628339767456055,
"learning_rate": 5.376872167681634e-07,
"loss": 0.8907,
"step": 1680
},
{
"epoch": 2.7006012024048096,
"grad_norm": 0.38656118512153625,
"learning_rate": 5.10000153332515e-07,
"loss": 0.8128,
"step": 1685
},
{
"epoch": 2.708617234468938,
"grad_norm": 0.3081871569156647,
"learning_rate": 4.830262496944693e-07,
"loss": 0.8044,
"step": 1690
},
{
"epoch": 2.716633266533066,
"grad_norm": 0.3895088732242584,
"learning_rate": 4.5676753290397445e-07,
"loss": 0.8063,
"step": 1695
},
{
"epoch": 2.7246492985971944,
"grad_norm": 0.3495742082595825,
"learning_rate": 4.312259762657145e-07,
"loss": 0.8404,
"step": 1700
},
{
"epoch": 2.7326653306613227,
"grad_norm": 0.3835850954055786,
"learning_rate": 4.0640349919082056e-07,
"loss": 0.9816,
"step": 1705
},
{
"epoch": 2.740681362725451,
"grad_norm": 0.4039825201034546,
"learning_rate": 3.8230196705263734e-07,
"loss": 0.936,
"step": 1710
},
{
"epoch": 2.748697394789579,
"grad_norm": 0.5234516263008118,
"learning_rate": 3.5892319104653294e-07,
"loss": 0.831,
"step": 1715
},
{
"epoch": 2.7567134268537075,
"grad_norm": 0.3361075520515442,
"learning_rate": 3.3626892805379565e-07,
"loss": 0.8202,
"step": 1720
},
{
"epoch": 2.7647294589178357,
"grad_norm": 0.3541311025619507,
"learning_rate": 3.1434088050960934e-07,
"loss": 0.8253,
"step": 1725
},
{
"epoch": 2.772745490981964,
"grad_norm": 0.36018094420433044,
"learning_rate": 2.9314069627511045e-07,
"loss": 0.8884,
"step": 1730
},
{
"epoch": 2.7807615230460923,
"grad_norm": 0.47873905301094055,
"learning_rate": 2.726699685135603e-07,
"loss": 0.8518,
"step": 1735
},
{
"epoch": 2.7887775551102205,
"grad_norm": 0.4113430380821228,
"learning_rate": 2.529302355706165e-07,
"loss": 0.859,
"step": 1740
},
{
"epoch": 2.796793587174349,
"grad_norm": 0.43893691897392273,
"learning_rate": 2.3392298085873288e-07,
"loss": 0.8878,
"step": 1745
},
{
"epoch": 2.804809619238477,
"grad_norm": 0.30622348189353943,
"learning_rate": 2.1564963274568028e-07,
"loss": 0.8205,
"step": 1750
},
{
"epoch": 2.8128256513026053,
"grad_norm": 1.027692198753357,
"learning_rate": 1.9811156444720648e-07,
"loss": 0.8271,
"step": 1755
},
{
"epoch": 2.8208416833667336,
"grad_norm": 0.3843301832675934,
"learning_rate": 1.8131009392384324e-07,
"loss": 0.9512,
"step": 1760
},
{
"epoch": 2.828857715430862,
"grad_norm": 0.4416702091693878,
"learning_rate": 1.6524648378186125e-07,
"loss": 0.8444,
"step": 1765
},
{
"epoch": 2.83687374749499,
"grad_norm": 0.34150800108909607,
"learning_rate": 1.49921941178387e-07,
"loss": 0.8285,
"step": 1770
},
{
"epoch": 2.8448897795591184,
"grad_norm": 0.4354546368122101,
"learning_rate": 1.35337617730692e-07,
"loss": 0.8678,
"step": 1775
},
{
"epoch": 2.8529058116232466,
"grad_norm": 0.5867286920547485,
"learning_rate": 1.2149460942964097e-07,
"loss": 0.9001,
"step": 1780
},
{
"epoch": 2.860921843687375,
"grad_norm": 0.40360739827156067,
"learning_rate": 1.0839395655733664e-07,
"loss": 0.7915,
"step": 1785
},
{
"epoch": 2.868937875751503,
"grad_norm": 0.4062676727771759,
"learning_rate": 9.603664360894327e-08,
"loss": 0.9189,
"step": 1790
},
{
"epoch": 2.8769539078156314,
"grad_norm": 0.36837005615234375,
"learning_rate": 8.442359921870148e-08,
"loss": 0.82,
"step": 1795
},
{
"epoch": 2.8849699398797597,
"grad_norm": 0.34926676750183105,
"learning_rate": 7.35556960901429e-08,
"loss": 0.7414,
"step": 1800
},
{
"epoch": 2.8929859719438875,
"grad_norm": 0.36959943175315857,
"learning_rate": 6.343375093050941e-08,
"loss": 0.8122,
"step": 1805
},
{
"epoch": 2.901002004008016,
"grad_norm": 0.3383861184120178,
"learning_rate": 5.405852438937764e-08,
"loss": 0.9356,
"step": 1810
},
{
"epoch": 2.909018036072144,
"grad_norm": 0.3275286853313446,
"learning_rate": 4.543072100149704e-08,
"loss": 0.9095,
"step": 1815
},
{
"epoch": 2.9170340681362728,
"grad_norm": 0.35631418228149414,
"learning_rate": 3.755098913384325e-08,
"loss": 0.8541,
"step": 1820
},
{
"epoch": 2.9250501002004006,
"grad_norm": 0.33556675910949707,
"learning_rate": 3.0419920936900494e-08,
"loss": 0.7787,
"step": 1825
},
{
"epoch": 2.9330661322645293,
"grad_norm": 0.3943828046321869,
"learning_rate": 2.403805230015488e-08,
"loss": 0.937,
"step": 1830
},
{
"epoch": 2.941082164328657,
"grad_norm": 0.37884265184402466,
"learning_rate": 1.840586281182888e-08,
"loss": 0.7455,
"step": 1835
},
{
"epoch": 2.949098196392786,
"grad_norm": 0.4555797576904297,
"learning_rate": 1.3523775722834586e-08,
"loss": 0.8243,
"step": 1840
},
{
"epoch": 2.9571142284569136,
"grad_norm": 0.392288476228714,
"learning_rate": 9.39215791497583e-09,
"loss": 0.8212,
"step": 1845
},
{
"epoch": 2.9651302605210423,
"grad_norm": 0.4135635495185852,
"learning_rate": 6.011319873370225e-09,
"loss": 0.9677,
"step": 1850
},
{
"epoch": 2.97314629258517,
"grad_norm": 0.3438234329223633,
"learning_rate": 3.3815156631178404e-09,
"loss": 0.8026,
"step": 1855
},
{
"epoch": 2.981162324649299,
"grad_norm": 0.3370349109172821,
"learning_rate": 1.502942910212024e-09,
"loss": 0.812,
"step": 1860
},
{
"epoch": 2.9891783567134267,
"grad_norm": 0.3954819142818451,
"learning_rate": 3.757427866846186e-10,
"loss": 0.8795,
"step": 1865
}
],
"logging_steps": 5,
"max_steps": 1869,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.163374755613706e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}