mid_result / trainer_state.json
joygoround's picture
Upload folder using huggingface_hub
c417e7c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.15362377537410882,
"eval_steps": 501,
"global_step": 5500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.793159552256524e-05,
"grad_norm": 37.485023498535156,
"learning_rate": 2e-06,
"loss": 2.1306,
"step": 1
},
{
"epoch": 0.00027931595522565235,
"grad_norm": 27.45977210998535,
"learning_rate": 1.9994972347913524e-06,
"loss": 2.0875,
"step": 10
},
{
"epoch": 0.0005586319104513047,
"grad_norm": 18.15777015686035,
"learning_rate": 1.998938606781744e-06,
"loss": 1.9196,
"step": 20
},
{
"epoch": 0.0008379478656769571,
"grad_norm": 20.21579360961914,
"learning_rate": 1.9983799787721355e-06,
"loss": 1.8183,
"step": 30
},
{
"epoch": 0.0011172638209026094,
"grad_norm": 14.037498474121094,
"learning_rate": 1.9978213507625273e-06,
"loss": 1.7441,
"step": 40
},
{
"epoch": 0.001396579776128262,
"grad_norm": 13.131041526794434,
"learning_rate": 1.9972627227529187e-06,
"loss": 1.6705,
"step": 50
},
{
"epoch": 0.0016758957313539142,
"grad_norm": 14.493760108947754,
"learning_rate": 1.99670409474331e-06,
"loss": 1.7261,
"step": 60
},
{
"epoch": 0.0019552116865795667,
"grad_norm": 15.630404472351074,
"learning_rate": 1.996145466733702e-06,
"loss": 1.7375,
"step": 70
},
{
"epoch": 0.002234527641805219,
"grad_norm": 12.822752952575684,
"learning_rate": 1.9955868387240936e-06,
"loss": 1.6683,
"step": 80
},
{
"epoch": 0.0025138435970308713,
"grad_norm": 12.364542961120605,
"learning_rate": 1.995028210714485e-06,
"loss": 1.5954,
"step": 90
},
{
"epoch": 0.002793159552256524,
"grad_norm": 14.236380577087402,
"learning_rate": 1.994469582704877e-06,
"loss": 1.6029,
"step": 100
},
{
"epoch": 0.0030724755074821764,
"grad_norm": 13.032842636108398,
"learning_rate": 1.9939109546952686e-06,
"loss": 1.6026,
"step": 110
},
{
"epoch": 0.0033517914627078284,
"grad_norm": 14.683618545532227,
"learning_rate": 1.99335232668566e-06,
"loss": 1.6354,
"step": 120
},
{
"epoch": 0.003631107417933481,
"grad_norm": 12.228134155273438,
"learning_rate": 1.9927936986760517e-06,
"loss": 1.5176,
"step": 130
},
{
"epoch": 0.0039104233731591335,
"grad_norm": 14.089844703674316,
"learning_rate": 1.992235070666443e-06,
"loss": 1.5644,
"step": 140
},
{
"epoch": 0.004189739328384786,
"grad_norm": 13.192647933959961,
"learning_rate": 1.9916764426568345e-06,
"loss": 1.6016,
"step": 150
},
{
"epoch": 0.004469055283610438,
"grad_norm": 13.630595207214355,
"learning_rate": 1.9911178146472263e-06,
"loss": 1.5195,
"step": 160
},
{
"epoch": 0.00474837123883609,
"grad_norm": 12.795489311218262,
"learning_rate": 1.990559186637618e-06,
"loss": 1.4888,
"step": 170
},
{
"epoch": 0.005027687194061743,
"grad_norm": 14.711332321166992,
"learning_rate": 1.9900005586280094e-06,
"loss": 1.4658,
"step": 180
},
{
"epoch": 0.005307003149287395,
"grad_norm": 14.496808052062988,
"learning_rate": 1.989441930618401e-06,
"loss": 1.5231,
"step": 190
},
{
"epoch": 0.005586319104513048,
"grad_norm": 11.26836109161377,
"learning_rate": 1.988883302608793e-06,
"loss": 1.3865,
"step": 200
},
{
"epoch": 0.0058656350597387,
"grad_norm": 12.824557304382324,
"learning_rate": 1.9883246745991844e-06,
"loss": 1.4743,
"step": 210
},
{
"epoch": 0.006144951014964353,
"grad_norm": 13.50056266784668,
"learning_rate": 1.9877660465895757e-06,
"loss": 1.3856,
"step": 220
},
{
"epoch": 0.006424266970190004,
"grad_norm": 12.35004997253418,
"learning_rate": 1.9872074185799675e-06,
"loss": 1.4165,
"step": 230
},
{
"epoch": 0.006703582925415657,
"grad_norm": 11.969117164611816,
"learning_rate": 1.986648790570359e-06,
"loss": 1.3719,
"step": 240
},
{
"epoch": 0.006982898880641309,
"grad_norm": 12.795903205871582,
"learning_rate": 1.9860901625607507e-06,
"loss": 1.4586,
"step": 250
},
{
"epoch": 0.007262214835866962,
"grad_norm": 14.326574325561523,
"learning_rate": 1.9855315345511424e-06,
"loss": 1.4185,
"step": 260
},
{
"epoch": 0.0075415307910926144,
"grad_norm": 14.065360069274902,
"learning_rate": 1.984972906541534e-06,
"loss": 1.3441,
"step": 270
},
{
"epoch": 0.007820846746318267,
"grad_norm": 13.229423522949219,
"learning_rate": 1.9844142785319256e-06,
"loss": 1.4299,
"step": 280
},
{
"epoch": 0.00810016270154392,
"grad_norm": 10.81311321258545,
"learning_rate": 1.9838556505223174e-06,
"loss": 1.4352,
"step": 290
},
{
"epoch": 0.008379478656769572,
"grad_norm": 11.971837043762207,
"learning_rate": 1.9832970225127088e-06,
"loss": 1.3721,
"step": 300
},
{
"epoch": 0.008658794611995225,
"grad_norm": 10.858203887939453,
"learning_rate": 1.9827383945031e-06,
"loss": 1.3593,
"step": 310
},
{
"epoch": 0.008938110567220875,
"grad_norm": 13.713777542114258,
"learning_rate": 1.982179766493492e-06,
"loss": 1.3732,
"step": 320
},
{
"epoch": 0.009217426522446528,
"grad_norm": 10.59002685546875,
"learning_rate": 1.9816211384838833e-06,
"loss": 1.3841,
"step": 330
},
{
"epoch": 0.00949674247767218,
"grad_norm": 12.030527114868164,
"learning_rate": 1.981062510474275e-06,
"loss": 1.3584,
"step": 340
},
{
"epoch": 0.009776058432897833,
"grad_norm": 13.677680969238281,
"learning_rate": 1.980503882464667e-06,
"loss": 1.3468,
"step": 350
},
{
"epoch": 0.010055374388123485,
"grad_norm": 11.954497337341309,
"learning_rate": 1.9799452544550582e-06,
"loss": 1.3558,
"step": 360
},
{
"epoch": 0.010334690343349138,
"grad_norm": 11.743765830993652,
"learning_rate": 1.97938662644545e-06,
"loss": 1.33,
"step": 370
},
{
"epoch": 0.01061400629857479,
"grad_norm": 14.092965126037598,
"learning_rate": 1.978827998435842e-06,
"loss": 1.3247,
"step": 380
},
{
"epoch": 0.010893322253800443,
"grad_norm": 12.351668357849121,
"learning_rate": 1.978269370426233e-06,
"loss": 1.359,
"step": 390
},
{
"epoch": 0.011172638209026095,
"grad_norm": 12.778825759887695,
"learning_rate": 1.9777107424166245e-06,
"loss": 1.3317,
"step": 400
},
{
"epoch": 0.011451954164251748,
"grad_norm": 13.161787986755371,
"learning_rate": 1.9771521144070163e-06,
"loss": 1.3726,
"step": 410
},
{
"epoch": 0.0117312701194774,
"grad_norm": 12.683723449707031,
"learning_rate": 1.9765934863974077e-06,
"loss": 1.2869,
"step": 420
},
{
"epoch": 0.012010586074703053,
"grad_norm": 11.430862426757812,
"learning_rate": 1.9760348583877995e-06,
"loss": 1.3741,
"step": 430
},
{
"epoch": 0.012289902029928705,
"grad_norm": 12.193629264831543,
"learning_rate": 1.9754762303781913e-06,
"loss": 1.3247,
"step": 440
},
{
"epoch": 0.012569217985154356,
"grad_norm": 12.044336318969727,
"learning_rate": 1.9749176023685826e-06,
"loss": 1.3258,
"step": 450
},
{
"epoch": 0.012848533940380009,
"grad_norm": 13.162397384643555,
"learning_rate": 1.9743589743589744e-06,
"loss": 1.3035,
"step": 460
},
{
"epoch": 0.013127849895605661,
"grad_norm": 13.332141876220703,
"learning_rate": 1.9738003463493658e-06,
"loss": 1.2725,
"step": 470
},
{
"epoch": 0.013407165850831314,
"grad_norm": 11.433170318603516,
"learning_rate": 1.9732417183397576e-06,
"loss": 1.293,
"step": 480
},
{
"epoch": 0.013686481806056966,
"grad_norm": 11.537554740905762,
"learning_rate": 1.972683090330149e-06,
"loss": 1.3865,
"step": 490
},
{
"epoch": 0.013965797761282619,
"grad_norm": 13.405438423156738,
"learning_rate": 1.9721244623205407e-06,
"loss": 1.3675,
"step": 500
},
{
"epoch": 0.013993729356805184,
"eval_complexity_accuracy": 0.0,
"eval_loss": 1.3878909349441528,
"eval_runtime": 33.2902,
"eval_samples_per_second": 15.019,
"eval_steps_per_second": 1.892,
"step": 501
},
{
"epoch": 0.014245113716508271,
"grad_norm": 13.519207000732422,
"learning_rate": 1.971565834310932e-06,
"loss": 1.2924,
"step": 510
},
{
"epoch": 0.014524429671733924,
"grad_norm": 12.743926048278809,
"learning_rate": 1.971007206301324e-06,
"loss": 1.3396,
"step": 520
},
{
"epoch": 0.014803745626959576,
"grad_norm": 11.494955062866211,
"learning_rate": 1.9704485782917157e-06,
"loss": 1.3783,
"step": 530
},
{
"epoch": 0.015083061582185229,
"grad_norm": 13.423910140991211,
"learning_rate": 1.969889950282107e-06,
"loss": 1.3687,
"step": 540
},
{
"epoch": 0.015362377537410881,
"grad_norm": 12.793667793273926,
"learning_rate": 1.969331322272499e-06,
"loss": 1.3364,
"step": 550
},
{
"epoch": 0.015641693492636534,
"grad_norm": 12.060896873474121,
"learning_rate": 1.96877269426289e-06,
"loss": 1.3495,
"step": 560
},
{
"epoch": 0.015921009447862185,
"grad_norm": 10.879355430603027,
"learning_rate": 1.968214066253282e-06,
"loss": 1.2966,
"step": 570
},
{
"epoch": 0.01620032540308784,
"grad_norm": 11.109333038330078,
"learning_rate": 1.9676554382436733e-06,
"loss": 1.3587,
"step": 580
},
{
"epoch": 0.01647964135831349,
"grad_norm": 14.08962345123291,
"learning_rate": 1.967096810234065e-06,
"loss": 1.3881,
"step": 590
},
{
"epoch": 0.016758957313539144,
"grad_norm": 13.27667236328125,
"learning_rate": 1.9665381822244565e-06,
"loss": 1.3097,
"step": 600
},
{
"epoch": 0.017038273268764795,
"grad_norm": 11.379706382751465,
"learning_rate": 1.9659795542148483e-06,
"loss": 1.305,
"step": 610
},
{
"epoch": 0.01731758922399045,
"grad_norm": 12.25299072265625,
"learning_rate": 1.96542092620524e-06,
"loss": 1.3136,
"step": 620
},
{
"epoch": 0.0175969051792161,
"grad_norm": 11.619131088256836,
"learning_rate": 1.9648622981956314e-06,
"loss": 1.3265,
"step": 630
},
{
"epoch": 0.01787622113444175,
"grad_norm": 13.470244407653809,
"learning_rate": 1.9643036701860228e-06,
"loss": 1.3938,
"step": 640
},
{
"epoch": 0.018155537089667405,
"grad_norm": 12.438233375549316,
"learning_rate": 1.9637450421764146e-06,
"loss": 1.3579,
"step": 650
},
{
"epoch": 0.018434853044893056,
"grad_norm": 11.806841850280762,
"learning_rate": 1.9631864141668064e-06,
"loss": 1.3165,
"step": 660
},
{
"epoch": 0.01871416900011871,
"grad_norm": 10.943819999694824,
"learning_rate": 1.9626277861571977e-06,
"loss": 1.3435,
"step": 670
},
{
"epoch": 0.01899348495534436,
"grad_norm": 11.002156257629395,
"learning_rate": 1.9620691581475895e-06,
"loss": 1.3137,
"step": 680
},
{
"epoch": 0.019272800910570015,
"grad_norm": 11.192991256713867,
"learning_rate": 1.961510530137981e-06,
"loss": 1.341,
"step": 690
},
{
"epoch": 0.019552116865795666,
"grad_norm": 11.328652381896973,
"learning_rate": 1.9609519021283727e-06,
"loss": 1.3744,
"step": 700
},
{
"epoch": 0.01983143282102132,
"grad_norm": 11.382583618164062,
"learning_rate": 1.9603932741187645e-06,
"loss": 1.2749,
"step": 710
},
{
"epoch": 0.02011074877624697,
"grad_norm": 9.765230178833008,
"learning_rate": 1.959834646109156e-06,
"loss": 1.3148,
"step": 720
},
{
"epoch": 0.020390064731472625,
"grad_norm": 10.793863296508789,
"learning_rate": 1.959276018099547e-06,
"loss": 1.2991,
"step": 730
},
{
"epoch": 0.020669380686698276,
"grad_norm": 12.697861671447754,
"learning_rate": 1.958717390089939e-06,
"loss": 1.3673,
"step": 740
},
{
"epoch": 0.02094869664192393,
"grad_norm": 11.78731632232666,
"learning_rate": 1.9581587620803308e-06,
"loss": 1.36,
"step": 750
},
{
"epoch": 0.02122801259714958,
"grad_norm": 11.723365783691406,
"learning_rate": 1.957600134070722e-06,
"loss": 1.3558,
"step": 760
},
{
"epoch": 0.02150732855237523,
"grad_norm": 11.155319213867188,
"learning_rate": 1.957041506061114e-06,
"loss": 1.3266,
"step": 770
},
{
"epoch": 0.021786644507600886,
"grad_norm": 11.003241539001465,
"learning_rate": 1.9564828780515053e-06,
"loss": 1.3161,
"step": 780
},
{
"epoch": 0.022065960462826537,
"grad_norm": 11.691163063049316,
"learning_rate": 1.955924250041897e-06,
"loss": 1.3782,
"step": 790
},
{
"epoch": 0.02234527641805219,
"grad_norm": 13.002456665039062,
"learning_rate": 1.955365622032289e-06,
"loss": 1.3738,
"step": 800
},
{
"epoch": 0.02262459237327784,
"grad_norm": 10.829326629638672,
"learning_rate": 1.9548069940226802e-06,
"loss": 1.3089,
"step": 810
},
{
"epoch": 0.022903908328503496,
"grad_norm": 11.259895324707031,
"learning_rate": 1.9542483660130716e-06,
"loss": 1.3002,
"step": 820
},
{
"epoch": 0.023183224283729147,
"grad_norm": 12.811477661132812,
"learning_rate": 1.9536897380034634e-06,
"loss": 1.3126,
"step": 830
},
{
"epoch": 0.0234625402389548,
"grad_norm": 11.347965240478516,
"learning_rate": 1.953131109993855e-06,
"loss": 1.3364,
"step": 840
},
{
"epoch": 0.02374185619418045,
"grad_norm": 12.316996574401855,
"learning_rate": 1.9525724819842465e-06,
"loss": 1.3208,
"step": 850
},
{
"epoch": 0.024021172149406106,
"grad_norm": 11.446920394897461,
"learning_rate": 1.9520138539746383e-06,
"loss": 1.3292,
"step": 860
},
{
"epoch": 0.024300488104631757,
"grad_norm": 11.28432559967041,
"learning_rate": 1.9514552259650297e-06,
"loss": 1.3331,
"step": 870
},
{
"epoch": 0.02457980405985741,
"grad_norm": 11.215639114379883,
"learning_rate": 1.9508965979554215e-06,
"loss": 1.3026,
"step": 880
},
{
"epoch": 0.02485912001508306,
"grad_norm": 11.234190940856934,
"learning_rate": 1.950337969945813e-06,
"loss": 1.2926,
"step": 890
},
{
"epoch": 0.025138435970308712,
"grad_norm": 11.294180870056152,
"learning_rate": 1.9497793419362046e-06,
"loss": 1.3644,
"step": 900
},
{
"epoch": 0.025417751925534367,
"grad_norm": 11.346322059631348,
"learning_rate": 1.949220713926596e-06,
"loss": 1.3124,
"step": 910
},
{
"epoch": 0.025697067880760018,
"grad_norm": 11.497020721435547,
"learning_rate": 1.9486620859169878e-06,
"loss": 1.2695,
"step": 920
},
{
"epoch": 0.025976383835985672,
"grad_norm": 10.896917343139648,
"learning_rate": 1.9481034579073796e-06,
"loss": 1.3141,
"step": 930
},
{
"epoch": 0.026255699791211323,
"grad_norm": 10.956721305847168,
"learning_rate": 1.947544829897771e-06,
"loss": 1.36,
"step": 940
},
{
"epoch": 0.026535015746436977,
"grad_norm": 11.796623229980469,
"learning_rate": 1.9469862018881627e-06,
"loss": 1.3586,
"step": 950
},
{
"epoch": 0.026814331701662628,
"grad_norm": 11.082508087158203,
"learning_rate": 1.946427573878554e-06,
"loss": 1.3514,
"step": 960
},
{
"epoch": 0.027093647656888282,
"grad_norm": 11.789264678955078,
"learning_rate": 1.945868945868946e-06,
"loss": 1.328,
"step": 970
},
{
"epoch": 0.027372963612113933,
"grad_norm": 11.333861351013184,
"learning_rate": 1.9453103178593372e-06,
"loss": 1.2765,
"step": 980
},
{
"epoch": 0.027652279567339587,
"grad_norm": 12.05320930480957,
"learning_rate": 1.944751689849729e-06,
"loss": 1.3679,
"step": 990
},
{
"epoch": 0.027931595522565238,
"grad_norm": 12.946321487426758,
"learning_rate": 1.9441930618401204e-06,
"loss": 1.3105,
"step": 1000
},
{
"epoch": 0.02798745871361037,
"eval_complexity_accuracy": 0.0,
"eval_loss": 1.364721655845642,
"eval_runtime": 34.1546,
"eval_samples_per_second": 14.639,
"eval_steps_per_second": 1.845,
"step": 1002
},
{
"epoch": 0.02821091147779089,
"grad_norm": 12.720813751220703,
"learning_rate": 1.943634433830512e-06,
"loss": 1.2763,
"step": 1010
},
{
"epoch": 0.028490227433016543,
"grad_norm": 10.137106895446777,
"learning_rate": 1.943075805820904e-06,
"loss": 1.3177,
"step": 1020
},
{
"epoch": 0.028769543388242193,
"grad_norm": 11.257421493530273,
"learning_rate": 1.9425171778112953e-06,
"loss": 1.3078,
"step": 1030
},
{
"epoch": 0.029048859343467848,
"grad_norm": 11.93409538269043,
"learning_rate": 1.941958549801687e-06,
"loss": 1.3251,
"step": 1040
},
{
"epoch": 0.0293281752986935,
"grad_norm": 12.464277267456055,
"learning_rate": 1.9413999217920785e-06,
"loss": 1.3199,
"step": 1050
},
{
"epoch": 0.029607491253919153,
"grad_norm": 12.42292308807373,
"learning_rate": 1.9408412937824703e-06,
"loss": 1.2815,
"step": 1060
},
{
"epoch": 0.029886807209144804,
"grad_norm": 11.653295516967773,
"learning_rate": 1.9402826657728616e-06,
"loss": 1.2948,
"step": 1070
},
{
"epoch": 0.030166123164370458,
"grad_norm": 12.255006790161133,
"learning_rate": 1.9397240377632534e-06,
"loss": 1.3263,
"step": 1080
},
{
"epoch": 0.03044543911959611,
"grad_norm": 10.424007415771484,
"learning_rate": 1.939165409753645e-06,
"loss": 1.2892,
"step": 1090
},
{
"epoch": 0.030724755074821763,
"grad_norm": 10.664515495300293,
"learning_rate": 1.9386067817440366e-06,
"loss": 1.3407,
"step": 1100
},
{
"epoch": 0.031004071030047414,
"grad_norm": 12.733943939208984,
"learning_rate": 1.9380481537344284e-06,
"loss": 1.348,
"step": 1110
},
{
"epoch": 0.03128338698527307,
"grad_norm": 10.41376781463623,
"learning_rate": 1.9374895257248197e-06,
"loss": 1.3827,
"step": 1120
},
{
"epoch": 0.03156270294049872,
"grad_norm": 13.944782257080078,
"learning_rate": 1.9369308977152115e-06,
"loss": 1.3218,
"step": 1130
},
{
"epoch": 0.03184201889572437,
"grad_norm": 12.373078346252441,
"learning_rate": 1.936372269705603e-06,
"loss": 1.2725,
"step": 1140
},
{
"epoch": 0.03212133485095002,
"grad_norm": 11.583971977233887,
"learning_rate": 1.9358136416959947e-06,
"loss": 1.2983,
"step": 1150
},
{
"epoch": 0.03240065080617568,
"grad_norm": 12.660507202148438,
"learning_rate": 1.935255013686386e-06,
"loss": 1.278,
"step": 1160
},
{
"epoch": 0.03267996676140133,
"grad_norm": 10.222640991210938,
"learning_rate": 1.934696385676778e-06,
"loss": 1.2866,
"step": 1170
},
{
"epoch": 0.03295928271662698,
"grad_norm": 12.668971061706543,
"learning_rate": 1.934137757667169e-06,
"loss": 1.3605,
"step": 1180
},
{
"epoch": 0.03323859867185263,
"grad_norm": 10.59626579284668,
"learning_rate": 1.933579129657561e-06,
"loss": 1.321,
"step": 1190
},
{
"epoch": 0.03351791462707829,
"grad_norm": 11.953704833984375,
"learning_rate": 1.9330205016479528e-06,
"loss": 1.2852,
"step": 1200
},
{
"epoch": 0.03379723058230394,
"grad_norm": 11.2271146774292,
"learning_rate": 1.932461873638344e-06,
"loss": 1.3196,
"step": 1210
},
{
"epoch": 0.03407654653752959,
"grad_norm": 10.453490257263184,
"learning_rate": 1.9319032456287355e-06,
"loss": 1.397,
"step": 1220
},
{
"epoch": 0.03435586249275524,
"grad_norm": 13.665384292602539,
"learning_rate": 1.9313446176191273e-06,
"loss": 1.3058,
"step": 1230
},
{
"epoch": 0.0346351784479809,
"grad_norm": 10.085427284240723,
"learning_rate": 1.930785989609519e-06,
"loss": 1.3081,
"step": 1240
},
{
"epoch": 0.03491449440320655,
"grad_norm": 12.101105690002441,
"learning_rate": 1.9302273615999105e-06,
"loss": 1.345,
"step": 1250
},
{
"epoch": 0.0351938103584322,
"grad_norm": 10.636537551879883,
"learning_rate": 1.9296687335903022e-06,
"loss": 1.3293,
"step": 1260
},
{
"epoch": 0.03547312631365785,
"grad_norm": 12.76969051361084,
"learning_rate": 1.9291101055806936e-06,
"loss": 1.3403,
"step": 1270
},
{
"epoch": 0.0357524422688835,
"grad_norm": 11.625609397888184,
"learning_rate": 1.9285514775710854e-06,
"loss": 1.2967,
"step": 1280
},
{
"epoch": 0.03603175822410916,
"grad_norm": 12.158754348754883,
"learning_rate": 1.927992849561477e-06,
"loss": 1.3727,
"step": 1290
},
{
"epoch": 0.03631107417933481,
"grad_norm": 13.211498260498047,
"learning_rate": 1.9274342215518685e-06,
"loss": 1.2925,
"step": 1300
},
{
"epoch": 0.03659039013456046,
"grad_norm": 16.932209014892578,
"learning_rate": 1.92687559354226e-06,
"loss": 1.3434,
"step": 1310
},
{
"epoch": 0.03686970608978611,
"grad_norm": 10.869868278503418,
"learning_rate": 1.9263169655326517e-06,
"loss": 1.3001,
"step": 1320
},
{
"epoch": 0.03714902204501177,
"grad_norm": 11.199213027954102,
"learning_rate": 1.9257583375230435e-06,
"loss": 1.3927,
"step": 1330
},
{
"epoch": 0.03742833800023742,
"grad_norm": 11.47125244140625,
"learning_rate": 1.925199709513435e-06,
"loss": 1.3426,
"step": 1340
},
{
"epoch": 0.03770765395546307,
"grad_norm": 12.344675064086914,
"learning_rate": 1.9246410815038266e-06,
"loss": 1.3525,
"step": 1350
},
{
"epoch": 0.03798696991068872,
"grad_norm": 12.831677436828613,
"learning_rate": 1.924082453494218e-06,
"loss": 1.329,
"step": 1360
},
{
"epoch": 0.03826628586591438,
"grad_norm": 11.5836763381958,
"learning_rate": 1.92352382548461e-06,
"loss": 1.3188,
"step": 1370
},
{
"epoch": 0.03854560182114003,
"grad_norm": 10.466170310974121,
"learning_rate": 1.9229651974750016e-06,
"loss": 1.3177,
"step": 1380
},
{
"epoch": 0.03882491777636568,
"grad_norm": 12.394039154052734,
"learning_rate": 1.922406569465393e-06,
"loss": 1.3367,
"step": 1390
},
{
"epoch": 0.03910423373159133,
"grad_norm": 10.985048294067383,
"learning_rate": 1.9218479414557843e-06,
"loss": 1.2887,
"step": 1400
},
{
"epoch": 0.03938354968681698,
"grad_norm": 12.47451400756836,
"learning_rate": 1.921289313446176e-06,
"loss": 1.2986,
"step": 1410
},
{
"epoch": 0.03966286564204264,
"grad_norm": 10.245006561279297,
"learning_rate": 1.920730685436568e-06,
"loss": 1.3413,
"step": 1420
},
{
"epoch": 0.03994218159726829,
"grad_norm": 11.382227897644043,
"learning_rate": 1.9201720574269593e-06,
"loss": 1.3652,
"step": 1430
},
{
"epoch": 0.04022149755249394,
"grad_norm": 13.765195846557617,
"learning_rate": 1.919613429417351e-06,
"loss": 1.3105,
"step": 1440
},
{
"epoch": 0.04050081350771959,
"grad_norm": 10.82947063446045,
"learning_rate": 1.9190548014077424e-06,
"loss": 1.3128,
"step": 1450
},
{
"epoch": 0.04078012946294525,
"grad_norm": 9.862834930419922,
"learning_rate": 1.918496173398134e-06,
"loss": 1.3368,
"step": 1460
},
{
"epoch": 0.0410594454181709,
"grad_norm": 9.987138748168945,
"learning_rate": 1.9179375453885256e-06,
"loss": 1.3172,
"step": 1470
},
{
"epoch": 0.04133876137339655,
"grad_norm": 10.993836402893066,
"learning_rate": 1.9173789173789174e-06,
"loss": 1.2961,
"step": 1480
},
{
"epoch": 0.0416180773286222,
"grad_norm": 10.989373207092285,
"learning_rate": 1.9168202893693087e-06,
"loss": 1.2854,
"step": 1490
},
{
"epoch": 0.04189739328384786,
"grad_norm": 14.129310607910156,
"learning_rate": 1.9162616613597005e-06,
"loss": 1.3157,
"step": 1500
},
{
"epoch": 0.04198118807041555,
"eval_complexity_accuracy": 0.0,
"eval_loss": 1.3546726703643799,
"eval_runtime": 34.0455,
"eval_samples_per_second": 14.686,
"eval_steps_per_second": 1.85,
"step": 1503
},
{
"epoch": 0.04217670923907351,
"grad_norm": 10.534819602966309,
"learning_rate": 1.9157030333500923e-06,
"loss": 1.3115,
"step": 1510
},
{
"epoch": 0.04245602519429916,
"grad_norm": 10.998124122619629,
"learning_rate": 1.9151444053404837e-06,
"loss": 1.2958,
"step": 1520
},
{
"epoch": 0.04273534114952481,
"grad_norm": 10.543405532836914,
"learning_rate": 1.9145857773308754e-06,
"loss": 1.2976,
"step": 1530
},
{
"epoch": 0.04301465710475046,
"grad_norm": 11.423952102661133,
"learning_rate": 1.914027149321267e-06,
"loss": 1.2922,
"step": 1540
},
{
"epoch": 0.04329397305997612,
"grad_norm": 10.33931827545166,
"learning_rate": 1.9134685213116586e-06,
"loss": 1.3221,
"step": 1550
},
{
"epoch": 0.04357328901520177,
"grad_norm": 10.731399536132812,
"learning_rate": 1.91290989330205e-06,
"loss": 1.2949,
"step": 1560
},
{
"epoch": 0.04385260497042742,
"grad_norm": 10.743152618408203,
"learning_rate": 1.9123512652924418e-06,
"loss": 1.275,
"step": 1570
},
{
"epoch": 0.04413192092565307,
"grad_norm": 10.677448272705078,
"learning_rate": 1.911792637282833e-06,
"loss": 1.2822,
"step": 1580
},
{
"epoch": 0.04441123688087873,
"grad_norm": 10.933751106262207,
"learning_rate": 1.911234009273225e-06,
"loss": 1.2784,
"step": 1590
},
{
"epoch": 0.04469055283610438,
"grad_norm": 10.95008659362793,
"learning_rate": 1.9106753812636167e-06,
"loss": 1.337,
"step": 1600
},
{
"epoch": 0.04496986879133003,
"grad_norm": 11.022769927978516,
"learning_rate": 1.910116753254008e-06,
"loss": 1.3194,
"step": 1610
},
{
"epoch": 0.04524918474655568,
"grad_norm": 12.916274070739746,
"learning_rate": 1.9095581252444e-06,
"loss": 1.3023,
"step": 1620
},
{
"epoch": 0.04552850070178134,
"grad_norm": 12.046470642089844,
"learning_rate": 1.9089994972347912e-06,
"loss": 1.2803,
"step": 1630
},
{
"epoch": 0.04580781665700699,
"grad_norm": 10.913056373596191,
"learning_rate": 1.9084408692251826e-06,
"loss": 1.3405,
"step": 1640
},
{
"epoch": 0.04608713261223264,
"grad_norm": 11.769244194030762,
"learning_rate": 1.9078822412155744e-06,
"loss": 1.2995,
"step": 1650
},
{
"epoch": 0.04636644856745829,
"grad_norm": 11.765388488769531,
"learning_rate": 1.907323613205966e-06,
"loss": 1.3457,
"step": 1660
},
{
"epoch": 0.046645764522683944,
"grad_norm": 11.881918907165527,
"learning_rate": 1.9067649851963577e-06,
"loss": 1.3367,
"step": 1670
},
{
"epoch": 0.0469250804779096,
"grad_norm": 10.628633499145508,
"learning_rate": 1.9062063571867493e-06,
"loss": 1.3091,
"step": 1680
},
{
"epoch": 0.04720439643313525,
"grad_norm": 11.146201133728027,
"learning_rate": 1.9056477291771409e-06,
"loss": 1.3041,
"step": 1690
},
{
"epoch": 0.0474837123883609,
"grad_norm": 10.595499992370605,
"learning_rate": 1.9050891011675325e-06,
"loss": 1.3185,
"step": 1700
},
{
"epoch": 0.047763028343586554,
"grad_norm": 12.041298866271973,
"learning_rate": 1.904530473157924e-06,
"loss": 1.3244,
"step": 1710
},
{
"epoch": 0.04804234429881221,
"grad_norm": 11.456694602966309,
"learning_rate": 1.9039718451483156e-06,
"loss": 1.2795,
"step": 1720
},
{
"epoch": 0.04832166025403786,
"grad_norm": 10.448249816894531,
"learning_rate": 1.9034132171387072e-06,
"loss": 1.2914,
"step": 1730
},
{
"epoch": 0.04860097620926351,
"grad_norm": 11.16418170928955,
"learning_rate": 1.9028545891290988e-06,
"loss": 1.3405,
"step": 1740
},
{
"epoch": 0.048880292164489164,
"grad_norm": 11.179234504699707,
"learning_rate": 1.9022959611194903e-06,
"loss": 1.3432,
"step": 1750
},
{
"epoch": 0.04915960811971482,
"grad_norm": 10.457565307617188,
"learning_rate": 1.9017373331098821e-06,
"loss": 1.3358,
"step": 1760
},
{
"epoch": 0.04943892407494047,
"grad_norm": 11.272239685058594,
"learning_rate": 1.9011787051002737e-06,
"loss": 1.2664,
"step": 1770
},
{
"epoch": 0.04971824003016612,
"grad_norm": 11.015891075134277,
"learning_rate": 1.9006200770906653e-06,
"loss": 1.2642,
"step": 1780
},
{
"epoch": 0.049997555985391774,
"grad_norm": 10.243793487548828,
"learning_rate": 1.9000614490810569e-06,
"loss": 1.2335,
"step": 1790
},
{
"epoch": 0.050276871940617425,
"grad_norm": 11.970431327819824,
"learning_rate": 1.8995028210714484e-06,
"loss": 1.2568,
"step": 1800
},
{
"epoch": 0.05055618789584308,
"grad_norm": 9.61301040649414,
"learning_rate": 1.89894419306184e-06,
"loss": 1.2969,
"step": 1810
},
{
"epoch": 0.050835503851068733,
"grad_norm": 10.591397285461426,
"learning_rate": 1.8983855650522316e-06,
"loss": 1.3004,
"step": 1820
},
{
"epoch": 0.051114819806294384,
"grad_norm": 15.13564682006836,
"learning_rate": 1.8978269370426232e-06,
"loss": 1.2676,
"step": 1830
},
{
"epoch": 0.051394135761520035,
"grad_norm": 10.456026077270508,
"learning_rate": 1.8972683090330148e-06,
"loss": 1.254,
"step": 1840
},
{
"epoch": 0.05167345171674569,
"grad_norm": 11.265973091125488,
"learning_rate": 1.8967096810234065e-06,
"loss": 1.2651,
"step": 1850
},
{
"epoch": 0.051952767671971344,
"grad_norm": 10.13062858581543,
"learning_rate": 1.8961510530137981e-06,
"loss": 1.3549,
"step": 1860
},
{
"epoch": 0.052232083627196994,
"grad_norm": 10.586962699890137,
"learning_rate": 1.8955924250041897e-06,
"loss": 1.326,
"step": 1870
},
{
"epoch": 0.052511399582422645,
"grad_norm": 11.121024131774902,
"learning_rate": 1.8950337969945813e-06,
"loss": 1.2838,
"step": 1880
},
{
"epoch": 0.052790715537648296,
"grad_norm": 10.71886920928955,
"learning_rate": 1.8944751689849726e-06,
"loss": 1.2793,
"step": 1890
},
{
"epoch": 0.053070031492873954,
"grad_norm": 10.959943771362305,
"learning_rate": 1.8939165409753644e-06,
"loss": 1.2731,
"step": 1900
},
{
"epoch": 0.053349347448099604,
"grad_norm": 11.72314453125,
"learning_rate": 1.893357912965756e-06,
"loss": 1.3049,
"step": 1910
},
{
"epoch": 0.053628663403325255,
"grad_norm": 11.75049114227295,
"learning_rate": 1.8927992849561476e-06,
"loss": 1.273,
"step": 1920
},
{
"epoch": 0.053907979358550906,
"grad_norm": 11.237908363342285,
"learning_rate": 1.8922406569465392e-06,
"loss": 1.3405,
"step": 1930
},
{
"epoch": 0.054187295313776564,
"grad_norm": 13.297497749328613,
"learning_rate": 1.891682028936931e-06,
"loss": 1.3276,
"step": 1940
},
{
"epoch": 0.054466611269002214,
"grad_norm": 12.209798812866211,
"learning_rate": 1.8911234009273225e-06,
"loss": 1.3376,
"step": 1950
},
{
"epoch": 0.054745927224227865,
"grad_norm": 13.262669563293457,
"learning_rate": 1.890564772917714e-06,
"loss": 1.2976,
"step": 1960
},
{
"epoch": 0.055025243179453516,
"grad_norm": 10.766546249389648,
"learning_rate": 1.8900061449081055e-06,
"loss": 1.3522,
"step": 1970
},
{
"epoch": 0.055304559134679174,
"grad_norm": 10.29268741607666,
"learning_rate": 1.889447516898497e-06,
"loss": 1.28,
"step": 1980
},
{
"epoch": 0.055583875089904825,
"grad_norm": 11.653640747070312,
"learning_rate": 1.8888888888888888e-06,
"loss": 1.2993,
"step": 1990
},
{
"epoch": 0.055863191045130475,
"grad_norm": 10.069348335266113,
"learning_rate": 1.8883302608792804e-06,
"loss": 1.27,
"step": 2000
},
{
"epoch": 0.05597491742722074,
"eval_complexity_accuracy": 0.0,
"eval_loss": 1.3485850095748901,
"eval_runtime": 34.0417,
"eval_samples_per_second": 14.688,
"eval_steps_per_second": 1.851,
"step": 2004
},
{
"epoch": 0.056142507000356126,
"grad_norm": 10.894604682922363,
"learning_rate": 1.887771632869672e-06,
"loss": 1.3105,
"step": 2010
},
{
"epoch": 0.05642182295558178,
"grad_norm": 11.579715728759766,
"learning_rate": 1.8872130048600636e-06,
"loss": 1.2776,
"step": 2020
},
{
"epoch": 0.056701138910807435,
"grad_norm": 10.074790000915527,
"learning_rate": 1.8866543768504553e-06,
"loss": 1.3366,
"step": 2030
},
{
"epoch": 0.056980454866033085,
"grad_norm": 11.219857215881348,
"learning_rate": 1.886095748840847e-06,
"loss": 1.2873,
"step": 2040
},
{
"epoch": 0.057259770821258736,
"grad_norm": 10.627588272094727,
"learning_rate": 1.8855371208312385e-06,
"loss": 1.3311,
"step": 2050
},
{
"epoch": 0.05753908677648439,
"grad_norm": 10.92846393585205,
"learning_rate": 1.8849784928216299e-06,
"loss": 1.3101,
"step": 2060
},
{
"epoch": 0.057818402731710045,
"grad_norm": 11.262550354003906,
"learning_rate": 1.8844198648120214e-06,
"loss": 1.3465,
"step": 2070
},
{
"epoch": 0.058097718686935695,
"grad_norm": 13.099771499633789,
"learning_rate": 1.8838612368024132e-06,
"loss": 1.3157,
"step": 2080
},
{
"epoch": 0.058377034642161346,
"grad_norm": 9.9907865524292,
"learning_rate": 1.8833026087928048e-06,
"loss": 1.298,
"step": 2090
},
{
"epoch": 0.058656350597387,
"grad_norm": 10.225235939025879,
"learning_rate": 1.8827439807831964e-06,
"loss": 1.2737,
"step": 2100
},
{
"epoch": 0.058935666552612655,
"grad_norm": 14.671952247619629,
"learning_rate": 1.882185352773588e-06,
"loss": 1.2994,
"step": 2110
},
{
"epoch": 0.059214982507838305,
"grad_norm": 10.452831268310547,
"learning_rate": 1.8816267247639797e-06,
"loss": 1.3168,
"step": 2120
},
{
"epoch": 0.059494298463063956,
"grad_norm": 11.753946304321289,
"learning_rate": 1.8810680967543713e-06,
"loss": 1.3209,
"step": 2130
},
{
"epoch": 0.05977361441828961,
"grad_norm": 11.631643295288086,
"learning_rate": 1.8805094687447627e-06,
"loss": 1.3339,
"step": 2140
},
{
"epoch": 0.06005293037351526,
"grad_norm": 11.326909065246582,
"learning_rate": 1.8799508407351543e-06,
"loss": 1.3191,
"step": 2150
},
{
"epoch": 0.060332246328740916,
"grad_norm": 11.047061920166016,
"learning_rate": 1.8793922127255458e-06,
"loss": 1.346,
"step": 2160
},
{
"epoch": 0.060611562283966566,
"grad_norm": 11.53350830078125,
"learning_rate": 1.8788335847159376e-06,
"loss": 1.3125,
"step": 2170
},
{
"epoch": 0.06089087823919222,
"grad_norm": 11.501274108886719,
"learning_rate": 1.8782749567063292e-06,
"loss": 1.3432,
"step": 2180
},
{
"epoch": 0.06117019419441787,
"grad_norm": 11.525626182556152,
"learning_rate": 1.8777163286967208e-06,
"loss": 1.362,
"step": 2190
},
{
"epoch": 0.061449510149643526,
"grad_norm": 13.74886703491211,
"learning_rate": 1.8771577006871124e-06,
"loss": 1.3157,
"step": 2200
},
{
"epoch": 0.061728826104869176,
"grad_norm": 12.192688941955566,
"learning_rate": 1.8765990726775042e-06,
"loss": 1.287,
"step": 2210
},
{
"epoch": 0.06200814206009483,
"grad_norm": 10.64345645904541,
"learning_rate": 1.8760404446678955e-06,
"loss": 1.2499,
"step": 2220
},
{
"epoch": 0.06228745801532048,
"grad_norm": 11.966428756713867,
"learning_rate": 1.875481816658287e-06,
"loss": 1.2789,
"step": 2230
},
{
"epoch": 0.06256677397054614,
"grad_norm": 11.889241218566895,
"learning_rate": 1.8749231886486787e-06,
"loss": 1.2621,
"step": 2240
},
{
"epoch": 0.06284608992577179,
"grad_norm": 13.372054100036621,
"learning_rate": 1.8743645606390702e-06,
"loss": 1.3493,
"step": 2250
},
{
"epoch": 0.06312540588099744,
"grad_norm": 10.879005432128906,
"learning_rate": 1.873805932629462e-06,
"loss": 1.3077,
"step": 2260
},
{
"epoch": 0.06340472183622309,
"grad_norm": 11.956343650817871,
"learning_rate": 1.8732473046198536e-06,
"loss": 1.3108,
"step": 2270
},
{
"epoch": 0.06368403779144874,
"grad_norm": 11.269684791564941,
"learning_rate": 1.8726886766102452e-06,
"loss": 1.2956,
"step": 2280
},
{
"epoch": 0.06396335374667439,
"grad_norm": 13.093775749206543,
"learning_rate": 1.8721300486006368e-06,
"loss": 1.2553,
"step": 2290
},
{
"epoch": 0.06424266970190004,
"grad_norm": 9.943842887878418,
"learning_rate": 1.8715714205910286e-06,
"loss": 1.2936,
"step": 2300
},
{
"epoch": 0.0645219856571257,
"grad_norm": 10.660123825073242,
"learning_rate": 1.87101279258142e-06,
"loss": 1.3319,
"step": 2310
},
{
"epoch": 0.06480130161235136,
"grad_norm": 11.023526191711426,
"learning_rate": 1.8704541645718115e-06,
"loss": 1.3441,
"step": 2320
},
{
"epoch": 0.065080617567577,
"grad_norm": 11.04121208190918,
"learning_rate": 1.869895536562203e-06,
"loss": 1.3157,
"step": 2330
},
{
"epoch": 0.06535993352280266,
"grad_norm": 10.915820121765137,
"learning_rate": 1.8693369085525946e-06,
"loss": 1.2891,
"step": 2340
},
{
"epoch": 0.06563924947802831,
"grad_norm": 11.1669282913208,
"learning_rate": 1.8687782805429864e-06,
"loss": 1.3301,
"step": 2350
},
{
"epoch": 0.06591856543325396,
"grad_norm": 13.473467826843262,
"learning_rate": 1.868219652533378e-06,
"loss": 1.3412,
"step": 2360
},
{
"epoch": 0.06619788138847961,
"grad_norm": 9.66751480102539,
"learning_rate": 1.8676610245237696e-06,
"loss": 1.4053,
"step": 2370
},
{
"epoch": 0.06647719734370526,
"grad_norm": 10.621736526489258,
"learning_rate": 1.8671023965141612e-06,
"loss": 1.301,
"step": 2380
},
{
"epoch": 0.06675651329893093,
"grad_norm": 12.115357398986816,
"learning_rate": 1.8665437685045527e-06,
"loss": 1.3193,
"step": 2390
},
{
"epoch": 0.06703582925415658,
"grad_norm": 10.837126731872559,
"learning_rate": 1.8659851404949443e-06,
"loss": 1.2737,
"step": 2400
},
{
"epoch": 0.06731514520938223,
"grad_norm": 11.175081253051758,
"learning_rate": 1.865426512485336e-06,
"loss": 1.3254,
"step": 2410
},
{
"epoch": 0.06759446116460788,
"grad_norm": 11.028107643127441,
"learning_rate": 1.8648678844757275e-06,
"loss": 1.3037,
"step": 2420
},
{
"epoch": 0.06787377711983353,
"grad_norm": 11.444878578186035,
"learning_rate": 1.864309256466119e-06,
"loss": 1.3237,
"step": 2430
},
{
"epoch": 0.06815309307505918,
"grad_norm": 10.279289245605469,
"learning_rate": 1.8637506284565108e-06,
"loss": 1.2836,
"step": 2440
},
{
"epoch": 0.06843240903028483,
"grad_norm": 10.37401008605957,
"learning_rate": 1.8631920004469024e-06,
"loss": 1.3329,
"step": 2450
},
{
"epoch": 0.06871172498551048,
"grad_norm": 9.833236694335938,
"learning_rate": 1.862633372437294e-06,
"loss": 1.3125,
"step": 2460
},
{
"epoch": 0.06899104094073613,
"grad_norm": 11.059619903564453,
"learning_rate": 1.8620747444276854e-06,
"loss": 1.2746,
"step": 2470
},
{
"epoch": 0.0692703568959618,
"grad_norm": 10.897518157958984,
"learning_rate": 1.8615161164180771e-06,
"loss": 1.2552,
"step": 2480
},
{
"epoch": 0.06954967285118745,
"grad_norm": 12.665666580200195,
"learning_rate": 1.8609574884084687e-06,
"loss": 1.3093,
"step": 2490
},
{
"epoch": 0.0698289888064131,
"grad_norm": 10.878984451293945,
"learning_rate": 1.8603988603988603e-06,
"loss": 1.291,
"step": 2500
},
{
"epoch": 0.06996864678402592,
"eval_complexity_accuracy": 0.0,
"eval_loss": 1.3446284532546997,
"eval_runtime": 33.87,
"eval_samples_per_second": 14.762,
"eval_steps_per_second": 1.86,
"step": 2505
},
{
"epoch": 0.07010830476163875,
"grad_norm": 11.848414421081543,
"learning_rate": 1.8598402323892519e-06,
"loss": 1.3266,
"step": 2510
},
{
"epoch": 0.0703876207168644,
"grad_norm": 11.258633613586426,
"learning_rate": 1.8592816043796435e-06,
"loss": 1.2747,
"step": 2520
},
{
"epoch": 0.07066693667209005,
"grad_norm": 12.249394416809082,
"learning_rate": 1.8587229763700352e-06,
"loss": 1.2717,
"step": 2530
},
{
"epoch": 0.0709462526273157,
"grad_norm": 11.384076118469238,
"learning_rate": 1.8581643483604268e-06,
"loss": 1.3339,
"step": 2540
},
{
"epoch": 0.07122556858254135,
"grad_norm": 11.27473258972168,
"learning_rate": 1.8576057203508182e-06,
"loss": 1.2737,
"step": 2550
},
{
"epoch": 0.071504884537767,
"grad_norm": 11.083890914916992,
"learning_rate": 1.8570470923412098e-06,
"loss": 1.3492,
"step": 2560
},
{
"epoch": 0.07178420049299267,
"grad_norm": 12.925027847290039,
"learning_rate": 1.8564884643316015e-06,
"loss": 1.3546,
"step": 2570
},
{
"epoch": 0.07206351644821832,
"grad_norm": 11.500834465026855,
"learning_rate": 1.8559298363219931e-06,
"loss": 1.2662,
"step": 2580
},
{
"epoch": 0.07234283240344397,
"grad_norm": 10.518533706665039,
"learning_rate": 1.8553712083123847e-06,
"loss": 1.2815,
"step": 2590
},
{
"epoch": 0.07262214835866962,
"grad_norm": 12.124496459960938,
"learning_rate": 1.8548125803027763e-06,
"loss": 1.312,
"step": 2600
},
{
"epoch": 0.07290146431389527,
"grad_norm": 10.693092346191406,
"learning_rate": 1.8542539522931679e-06,
"loss": 1.3071,
"step": 2610
},
{
"epoch": 0.07318078026912092,
"grad_norm": 9.837552070617676,
"learning_rate": 1.8536953242835596e-06,
"loss": 1.2985,
"step": 2620
},
{
"epoch": 0.07346009622434657,
"grad_norm": 11.058207511901855,
"learning_rate": 1.8531366962739512e-06,
"loss": 1.3406,
"step": 2630
},
{
"epoch": 0.07373941217957222,
"grad_norm": 10.664831161499023,
"learning_rate": 1.8525780682643426e-06,
"loss": 1.3086,
"step": 2640
},
{
"epoch": 0.07401872813479787,
"grad_norm": 11.020722389221191,
"learning_rate": 1.8520194402547342e-06,
"loss": 1.2951,
"step": 2650
},
{
"epoch": 0.07429804409002354,
"grad_norm": 11.75809383392334,
"learning_rate": 1.851460812245126e-06,
"loss": 1.2933,
"step": 2660
},
{
"epoch": 0.07457736004524919,
"grad_norm": 11.260404586791992,
"learning_rate": 1.8509021842355175e-06,
"loss": 1.3669,
"step": 2670
},
{
"epoch": 0.07485667600047484,
"grad_norm": 11.38213062286377,
"learning_rate": 1.8503435562259091e-06,
"loss": 1.3048,
"step": 2680
},
{
"epoch": 0.07513599195570049,
"grad_norm": 10.554960250854492,
"learning_rate": 1.8497849282163007e-06,
"loss": 1.3218,
"step": 2690
},
{
"epoch": 0.07541530791092614,
"grad_norm": 13.747076034545898,
"learning_rate": 1.8492263002066923e-06,
"loss": 1.3201,
"step": 2700
},
{
"epoch": 0.07569462386615179,
"grad_norm": 10.723194122314453,
"learning_rate": 1.848667672197084e-06,
"loss": 1.254,
"step": 2710
},
{
"epoch": 0.07597393982137744,
"grad_norm": 11.047980308532715,
"learning_rate": 1.8481090441874754e-06,
"loss": 1.3657,
"step": 2720
},
{
"epoch": 0.0762532557766031,
"grad_norm": 10.199549674987793,
"learning_rate": 1.847550416177867e-06,
"loss": 1.3775,
"step": 2730
},
{
"epoch": 0.07653257173182876,
"grad_norm": 9.60568904876709,
"learning_rate": 1.8469917881682586e-06,
"loss": 1.3047,
"step": 2740
},
{
"epoch": 0.07681188768705441,
"grad_norm": 10.989706993103027,
"learning_rate": 1.8464331601586501e-06,
"loss": 1.3228,
"step": 2750
},
{
"epoch": 0.07709120364228006,
"grad_norm": 12.18575668334961,
"learning_rate": 1.845874532149042e-06,
"loss": 1.3358,
"step": 2760
},
{
"epoch": 0.07737051959750571,
"grad_norm": 11.24397087097168,
"learning_rate": 1.8453159041394335e-06,
"loss": 1.3065,
"step": 2770
},
{
"epoch": 0.07764983555273136,
"grad_norm": 10.88451862335205,
"learning_rate": 1.844757276129825e-06,
"loss": 1.3278,
"step": 2780
},
{
"epoch": 0.07792915150795701,
"grad_norm": 11.730112075805664,
"learning_rate": 1.8441986481202167e-06,
"loss": 1.2865,
"step": 2790
},
{
"epoch": 0.07820846746318266,
"grad_norm": 11.872193336486816,
"learning_rate": 1.8436400201106082e-06,
"loss": 1.2728,
"step": 2800
},
{
"epoch": 0.07848778341840831,
"grad_norm": 13.440178871154785,
"learning_rate": 1.8430813921009998e-06,
"loss": 1.3169,
"step": 2810
},
{
"epoch": 0.07876709937363396,
"grad_norm": 10.802016258239746,
"learning_rate": 1.8425227640913914e-06,
"loss": 1.2491,
"step": 2820
},
{
"epoch": 0.07904641532885963,
"grad_norm": 11.56015396118164,
"learning_rate": 1.841964136081783e-06,
"loss": 1.313,
"step": 2830
},
{
"epoch": 0.07932573128408528,
"grad_norm": 11.145283699035645,
"learning_rate": 1.8414055080721745e-06,
"loss": 1.293,
"step": 2840
},
{
"epoch": 0.07960504723931093,
"grad_norm": 10.63716983795166,
"learning_rate": 1.8408468800625663e-06,
"loss": 1.3308,
"step": 2850
},
{
"epoch": 0.07988436319453658,
"grad_norm": 11.486001968383789,
"learning_rate": 1.840288252052958e-06,
"loss": 1.3047,
"step": 2860
},
{
"epoch": 0.08016367914976223,
"grad_norm": 10.340072631835938,
"learning_rate": 1.8397296240433495e-06,
"loss": 1.2763,
"step": 2870
},
{
"epoch": 0.08044299510498788,
"grad_norm": 11.177892684936523,
"learning_rate": 1.839170996033741e-06,
"loss": 1.3218,
"step": 2880
},
{
"epoch": 0.08072231106021353,
"grad_norm": 11.822985649108887,
"learning_rate": 1.8386123680241326e-06,
"loss": 1.3039,
"step": 2890
},
{
"epoch": 0.08100162701543918,
"grad_norm": 13.245485305786133,
"learning_rate": 1.8380537400145242e-06,
"loss": 1.281,
"step": 2900
},
{
"epoch": 0.08128094297066484,
"grad_norm": 11.78788948059082,
"learning_rate": 1.8374951120049158e-06,
"loss": 1.2176,
"step": 2910
},
{
"epoch": 0.0815602589258905,
"grad_norm": 11.278291702270508,
"learning_rate": 1.8369364839953074e-06,
"loss": 1.2972,
"step": 2920
},
{
"epoch": 0.08183957488111615,
"grad_norm": 11.119109153747559,
"learning_rate": 1.836377855985699e-06,
"loss": 1.2689,
"step": 2930
},
{
"epoch": 0.0821188908363418,
"grad_norm": 11.489620208740234,
"learning_rate": 1.8358192279760907e-06,
"loss": 1.3288,
"step": 2940
},
{
"epoch": 0.08239820679156745,
"grad_norm": 9.556941032409668,
"learning_rate": 1.8352605999664823e-06,
"loss": 1.3035,
"step": 2950
},
{
"epoch": 0.0826775227467931,
"grad_norm": 11.121188163757324,
"learning_rate": 1.8347019719568739e-06,
"loss": 1.2944,
"step": 2960
},
{
"epoch": 0.08295683870201875,
"grad_norm": 12.729305267333984,
"learning_rate": 1.8341433439472653e-06,
"loss": 1.3125,
"step": 2970
},
{
"epoch": 0.0832361546572444,
"grad_norm": 11.878944396972656,
"learning_rate": 1.833584715937657e-06,
"loss": 1.2959,
"step": 2980
},
{
"epoch": 0.08351547061247006,
"grad_norm": 11.5958833694458,
"learning_rate": 1.8330260879280486e-06,
"loss": 1.3453,
"step": 2990
},
{
"epoch": 0.08379478656769572,
"grad_norm": 12.451947212219238,
"learning_rate": 1.8324674599184402e-06,
"loss": 1.2819,
"step": 3000
},
{
"epoch": 0.0839623761408311,
"eval_complexity_accuracy": 0.0,
"eval_loss": 1.3420253992080688,
"eval_runtime": 33.8789,
"eval_samples_per_second": 14.758,
"eval_steps_per_second": 1.86,
"step": 3006
},
{
"epoch": 0.08407410252292137,
"grad_norm": 12.807692527770996,
"learning_rate": 1.8319088319088318e-06,
"loss": 1.3238,
"step": 3010
},
{
"epoch": 0.08435341847814702,
"grad_norm": 10.1639404296875,
"learning_rate": 1.8313502038992234e-06,
"loss": 1.2694,
"step": 3020
},
{
"epoch": 0.08463273443337267,
"grad_norm": 11.123089790344238,
"learning_rate": 1.8307915758896151e-06,
"loss": 1.2404,
"step": 3030
},
{
"epoch": 0.08491205038859832,
"grad_norm": 11.976441383361816,
"learning_rate": 1.8302329478800067e-06,
"loss": 1.3319,
"step": 3040
},
{
"epoch": 0.08519136634382397,
"grad_norm": 11.400232315063477,
"learning_rate": 1.829674319870398e-06,
"loss": 1.2899,
"step": 3050
},
{
"epoch": 0.08547068229904962,
"grad_norm": 9.668082237243652,
"learning_rate": 1.8291156918607897e-06,
"loss": 1.3343,
"step": 3060
},
{
"epoch": 0.08574999825427528,
"grad_norm": 9.114018440246582,
"learning_rate": 1.8285570638511814e-06,
"loss": 1.2822,
"step": 3070
},
{
"epoch": 0.08602931420950093,
"grad_norm": 11.763662338256836,
"learning_rate": 1.827998435841573e-06,
"loss": 1.303,
"step": 3080
},
{
"epoch": 0.08630863016472659,
"grad_norm": 12.478301048278809,
"learning_rate": 1.8274398078319646e-06,
"loss": 1.3204,
"step": 3090
},
{
"epoch": 0.08658794611995224,
"grad_norm": 13.733002662658691,
"learning_rate": 1.8268811798223562e-06,
"loss": 1.2763,
"step": 3100
},
{
"epoch": 0.08686726207517789,
"grad_norm": 11.211143493652344,
"learning_rate": 1.8263225518127478e-06,
"loss": 1.3059,
"step": 3110
},
{
"epoch": 0.08714657803040354,
"grad_norm": 10.02708911895752,
"learning_rate": 1.8257639238031395e-06,
"loss": 1.2576,
"step": 3120
},
{
"epoch": 0.0874258939856292,
"grad_norm": 10.271854400634766,
"learning_rate": 1.8252052957935311e-06,
"loss": 1.3526,
"step": 3130
},
{
"epoch": 0.08770520994085484,
"grad_norm": 10.915563583374023,
"learning_rate": 1.8246466677839225e-06,
"loss": 1.2951,
"step": 3140
},
{
"epoch": 0.0879845258960805,
"grad_norm": 12.06615161895752,
"learning_rate": 1.824088039774314e-06,
"loss": 1.2678,
"step": 3150
},
{
"epoch": 0.08826384185130615,
"grad_norm": 11.441333770751953,
"learning_rate": 1.8235294117647058e-06,
"loss": 1.3605,
"step": 3160
},
{
"epoch": 0.0885431578065318,
"grad_norm": 11.135004997253418,
"learning_rate": 1.8229707837550974e-06,
"loss": 1.3638,
"step": 3170
},
{
"epoch": 0.08882247376175746,
"grad_norm": 10.272753715515137,
"learning_rate": 1.822412155745489e-06,
"loss": 1.3748,
"step": 3180
},
{
"epoch": 0.08910178971698311,
"grad_norm": 10.645270347595215,
"learning_rate": 1.8218535277358806e-06,
"loss": 1.3288,
"step": 3190
},
{
"epoch": 0.08938110567220876,
"grad_norm": 11.341635704040527,
"learning_rate": 1.8212948997262722e-06,
"loss": 1.3295,
"step": 3200
},
{
"epoch": 0.08966042162743441,
"grad_norm": 11.285005569458008,
"learning_rate": 1.820736271716664e-06,
"loss": 1.3094,
"step": 3210
},
{
"epoch": 0.08993973758266006,
"grad_norm": 11.092018127441406,
"learning_rate": 1.8201776437070553e-06,
"loss": 1.2616,
"step": 3220
},
{
"epoch": 0.09021905353788572,
"grad_norm": 11.833807945251465,
"learning_rate": 1.8196190156974469e-06,
"loss": 1.2915,
"step": 3230
},
{
"epoch": 0.09049836949311137,
"grad_norm": 11.941621780395508,
"learning_rate": 1.8190603876878385e-06,
"loss": 1.2984,
"step": 3240
},
{
"epoch": 0.09077768544833702,
"grad_norm": 11.135613441467285,
"learning_rate": 1.8185017596782303e-06,
"loss": 1.2638,
"step": 3250
},
{
"epoch": 0.09105700140356268,
"grad_norm": 11.356342315673828,
"learning_rate": 1.8179431316686218e-06,
"loss": 1.3199,
"step": 3260
},
{
"epoch": 0.09133631735878833,
"grad_norm": 11.519587516784668,
"learning_rate": 1.8173845036590134e-06,
"loss": 1.288,
"step": 3270
},
{
"epoch": 0.09161563331401398,
"grad_norm": 11.335143089294434,
"learning_rate": 1.816825875649405e-06,
"loss": 1.2472,
"step": 3280
},
{
"epoch": 0.09189494926923963,
"grad_norm": 12.195459365844727,
"learning_rate": 1.8162672476397966e-06,
"loss": 1.2992,
"step": 3290
},
{
"epoch": 0.09217426522446528,
"grad_norm": 12.05800724029541,
"learning_rate": 1.8157086196301881e-06,
"loss": 1.3177,
"step": 3300
},
{
"epoch": 0.09245358117969094,
"grad_norm": 10.606769561767578,
"learning_rate": 1.8151499916205797e-06,
"loss": 1.2693,
"step": 3310
},
{
"epoch": 0.09273289713491659,
"grad_norm": 9.679693222045898,
"learning_rate": 1.8145913636109713e-06,
"loss": 1.2985,
"step": 3320
},
{
"epoch": 0.09301221309014224,
"grad_norm": 10.03492546081543,
"learning_rate": 1.8140327356013629e-06,
"loss": 1.2724,
"step": 3330
},
{
"epoch": 0.09329152904536789,
"grad_norm": 10.985275268554688,
"learning_rate": 1.8134741075917547e-06,
"loss": 1.2551,
"step": 3340
},
{
"epoch": 0.09357084500059355,
"grad_norm": 11.815603256225586,
"learning_rate": 1.8129154795821462e-06,
"loss": 1.3062,
"step": 3350
},
{
"epoch": 0.0938501609558192,
"grad_norm": 10.699769020080566,
"learning_rate": 1.8123568515725378e-06,
"loss": 1.2841,
"step": 3360
},
{
"epoch": 0.09412947691104485,
"grad_norm": 12.014618873596191,
"learning_rate": 1.8117982235629294e-06,
"loss": 1.3132,
"step": 3370
},
{
"epoch": 0.0944087928662705,
"grad_norm": 11.724242210388184,
"learning_rate": 1.811239595553321e-06,
"loss": 1.289,
"step": 3380
},
{
"epoch": 0.09468810882149616,
"grad_norm": 12.180294036865234,
"learning_rate": 1.8106809675437125e-06,
"loss": 1.3496,
"step": 3390
},
{
"epoch": 0.0949674247767218,
"grad_norm": 10.988664627075195,
"learning_rate": 1.8101223395341041e-06,
"loss": 1.3244,
"step": 3400
},
{
"epoch": 0.09524674073194746,
"grad_norm": 12.344855308532715,
"learning_rate": 1.8095637115244957e-06,
"loss": 1.2996,
"step": 3410
},
{
"epoch": 0.09552605668717311,
"grad_norm": 10.685724258422852,
"learning_rate": 1.8090050835148873e-06,
"loss": 1.261,
"step": 3420
},
{
"epoch": 0.09580537264239876,
"grad_norm": 12.516709327697754,
"learning_rate": 1.808446455505279e-06,
"loss": 1.2756,
"step": 3430
},
{
"epoch": 0.09608468859762442,
"grad_norm": 11.27023983001709,
"learning_rate": 1.8078878274956706e-06,
"loss": 1.2935,
"step": 3440
},
{
"epoch": 0.09636400455285007,
"grad_norm": 12.012152671813965,
"learning_rate": 1.8073291994860622e-06,
"loss": 1.317,
"step": 3450
},
{
"epoch": 0.09664332050807573,
"grad_norm": 11.254688262939453,
"learning_rate": 1.8067705714764538e-06,
"loss": 1.3272,
"step": 3460
},
{
"epoch": 0.09692263646330138,
"grad_norm": 12.010251998901367,
"learning_rate": 1.8062119434668452e-06,
"loss": 1.3732,
"step": 3470
},
{
"epoch": 0.09720195241852703,
"grad_norm": 12.29020881652832,
"learning_rate": 1.805653315457237e-06,
"loss": 1.2978,
"step": 3480
},
{
"epoch": 0.09748126837375268,
"grad_norm": 12.708207130432129,
"learning_rate": 1.8050946874476285e-06,
"loss": 1.3173,
"step": 3490
},
{
"epoch": 0.09776058432897833,
"grad_norm": 11.069357872009277,
"learning_rate": 1.80453605943802e-06,
"loss": 1.3188,
"step": 3500
},
{
"epoch": 0.0979561054976363,
"eval_complexity_accuracy": 0.0,
"eval_loss": 1.3392640352249146,
"eval_runtime": 34.0525,
"eval_samples_per_second": 14.683,
"eval_steps_per_second": 1.85,
"step": 3507
},
{
"epoch": 0.09803990028420398,
"grad_norm": 13.221611976623535,
"learning_rate": 1.8039774314284117e-06,
"loss": 1.2449,
"step": 3510
},
{
"epoch": 0.09831921623942964,
"grad_norm": 11.22923755645752,
"learning_rate": 1.8034188034188035e-06,
"loss": 1.3242,
"step": 3520
},
{
"epoch": 0.0985985321946553,
"grad_norm": 10.731654167175293,
"learning_rate": 1.802860175409195e-06,
"loss": 1.363,
"step": 3530
},
{
"epoch": 0.09887784814988095,
"grad_norm": 11.269989967346191,
"learning_rate": 1.8023015473995866e-06,
"loss": 1.2708,
"step": 3540
},
{
"epoch": 0.0991571641051066,
"grad_norm": 10.26361083984375,
"learning_rate": 1.801742919389978e-06,
"loss": 1.3219,
"step": 3550
},
{
"epoch": 0.09943648006033225,
"grad_norm": 10.341995239257812,
"learning_rate": 1.8011842913803696e-06,
"loss": 1.2953,
"step": 3560
},
{
"epoch": 0.0997157960155579,
"grad_norm": 10.96583080291748,
"learning_rate": 1.8006256633707613e-06,
"loss": 1.3132,
"step": 3570
},
{
"epoch": 0.09999511197078355,
"grad_norm": 11.878289222717285,
"learning_rate": 1.800067035361153e-06,
"loss": 1.3109,
"step": 3580
},
{
"epoch": 0.1002744279260092,
"grad_norm": 9.536112785339355,
"learning_rate": 1.7995084073515445e-06,
"loss": 1.3468,
"step": 3590
},
{
"epoch": 0.10055374388123485,
"grad_norm": 10.972228050231934,
"learning_rate": 1.798949779341936e-06,
"loss": 1.2877,
"step": 3600
},
{
"epoch": 0.10083305983646051,
"grad_norm": 13.208352088928223,
"learning_rate": 1.7983911513323279e-06,
"loss": 1.3701,
"step": 3610
},
{
"epoch": 0.10111237579168617,
"grad_norm": 11.069518089294434,
"learning_rate": 1.7978325233227194e-06,
"loss": 1.2269,
"step": 3620
},
{
"epoch": 0.10139169174691182,
"grad_norm": 11.275925636291504,
"learning_rate": 1.797273895313111e-06,
"loss": 1.3039,
"step": 3630
},
{
"epoch": 0.10167100770213747,
"grad_norm": 9.614294052124023,
"learning_rate": 1.7967152673035024e-06,
"loss": 1.2987,
"step": 3640
},
{
"epoch": 0.10195032365736312,
"grad_norm": 11.417302131652832,
"learning_rate": 1.796156639293894e-06,
"loss": 1.3161,
"step": 3650
},
{
"epoch": 0.10222963961258877,
"grad_norm": 13.481733322143555,
"learning_rate": 1.7955980112842857e-06,
"loss": 1.277,
"step": 3660
},
{
"epoch": 0.10250895556781442,
"grad_norm": 12.135738372802734,
"learning_rate": 1.7950393832746773e-06,
"loss": 1.3031,
"step": 3670
},
{
"epoch": 0.10278827152304007,
"grad_norm": 11.81387710571289,
"learning_rate": 1.794480755265069e-06,
"loss": 1.3195,
"step": 3680
},
{
"epoch": 0.10306758747826572,
"grad_norm": 12.341436386108398,
"learning_rate": 1.7939221272554605e-06,
"loss": 1.344,
"step": 3690
},
{
"epoch": 0.10334690343349139,
"grad_norm": 11.813607215881348,
"learning_rate": 1.7933634992458523e-06,
"loss": 1.2456,
"step": 3700
},
{
"epoch": 0.10362621938871704,
"grad_norm": 10.025679588317871,
"learning_rate": 1.7928048712362438e-06,
"loss": 1.3462,
"step": 3710
},
{
"epoch": 0.10390553534394269,
"grad_norm": 11.027300834655762,
"learning_rate": 1.7922462432266352e-06,
"loss": 1.248,
"step": 3720
},
{
"epoch": 0.10418485129916834,
"grad_norm": 10.462127685546875,
"learning_rate": 1.7916876152170268e-06,
"loss": 1.2827,
"step": 3730
},
{
"epoch": 0.10446416725439399,
"grad_norm": 11.07565689086914,
"learning_rate": 1.7911289872074184e-06,
"loss": 1.3317,
"step": 3740
},
{
"epoch": 0.10474348320961964,
"grad_norm": 10.2979097366333,
"learning_rate": 1.7905703591978101e-06,
"loss": 1.2484,
"step": 3750
},
{
"epoch": 0.10502279916484529,
"grad_norm": 11.009065628051758,
"learning_rate": 1.7900117311882017e-06,
"loss": 1.2882,
"step": 3760
},
{
"epoch": 0.10530211512007094,
"grad_norm": 11.308358192443848,
"learning_rate": 1.7894531031785933e-06,
"loss": 1.3095,
"step": 3770
},
{
"epoch": 0.10558143107529659,
"grad_norm": 11.058066368103027,
"learning_rate": 1.7888944751689849e-06,
"loss": 1.3372,
"step": 3780
},
{
"epoch": 0.10586074703052226,
"grad_norm": 13.103239059448242,
"learning_rate": 1.7883358471593767e-06,
"loss": 1.3124,
"step": 3790
},
{
"epoch": 0.10614006298574791,
"grad_norm": 10.5227689743042,
"learning_rate": 1.787777219149768e-06,
"loss": 1.2608,
"step": 3800
},
{
"epoch": 0.10641937894097356,
"grad_norm": 10.993918418884277,
"learning_rate": 1.7872185911401596e-06,
"loss": 1.259,
"step": 3810
},
{
"epoch": 0.10669869489619921,
"grad_norm": 11.612725257873535,
"learning_rate": 1.7866599631305512e-06,
"loss": 1.3046,
"step": 3820
},
{
"epoch": 0.10697801085142486,
"grad_norm": 11.200050354003906,
"learning_rate": 1.7861013351209428e-06,
"loss": 1.3439,
"step": 3830
},
{
"epoch": 0.10725732680665051,
"grad_norm": 12.19509220123291,
"learning_rate": 1.7855427071113346e-06,
"loss": 1.3107,
"step": 3840
},
{
"epoch": 0.10753664276187616,
"grad_norm": 11.498516082763672,
"learning_rate": 1.7849840791017261e-06,
"loss": 1.3341,
"step": 3850
},
{
"epoch": 0.10781595871710181,
"grad_norm": 12.180155754089355,
"learning_rate": 1.7844254510921177e-06,
"loss": 1.2753,
"step": 3860
},
{
"epoch": 0.10809527467232748,
"grad_norm": 10.637706756591797,
"learning_rate": 1.7838668230825093e-06,
"loss": 1.2221,
"step": 3870
},
{
"epoch": 0.10837459062755313,
"grad_norm": 11.029936790466309,
"learning_rate": 1.783308195072901e-06,
"loss": 1.3397,
"step": 3880
},
{
"epoch": 0.10865390658277878,
"grad_norm": 9.736263275146484,
"learning_rate": 1.7827495670632924e-06,
"loss": 1.339,
"step": 3890
},
{
"epoch": 0.10893322253800443,
"grad_norm": 11.16982364654541,
"learning_rate": 1.782190939053684e-06,
"loss": 1.3309,
"step": 3900
},
{
"epoch": 0.10921253849323008,
"grad_norm": 10.91207218170166,
"learning_rate": 1.7816323110440756e-06,
"loss": 1.2543,
"step": 3910
},
{
"epoch": 0.10949185444845573,
"grad_norm": 14.678290367126465,
"learning_rate": 1.7810736830344672e-06,
"loss": 1.355,
"step": 3920
},
{
"epoch": 0.10977117040368138,
"grad_norm": 11.110123634338379,
"learning_rate": 1.780515055024859e-06,
"loss": 1.251,
"step": 3930
},
{
"epoch": 0.11005048635890703,
"grad_norm": 11.788151741027832,
"learning_rate": 1.7799564270152505e-06,
"loss": 1.2544,
"step": 3940
},
{
"epoch": 0.11032980231413268,
"grad_norm": 10.897525787353516,
"learning_rate": 1.7793977990056421e-06,
"loss": 1.2932,
"step": 3950
},
{
"epoch": 0.11060911826935835,
"grad_norm": 12.554097175598145,
"learning_rate": 1.7788391709960337e-06,
"loss": 1.3412,
"step": 3960
},
{
"epoch": 0.110888434224584,
"grad_norm": 11.195846557617188,
"learning_rate": 1.7782805429864253e-06,
"loss": 1.311,
"step": 3970
},
{
"epoch": 0.11116775017980965,
"grad_norm": 11.825657844543457,
"learning_rate": 1.7777219149768168e-06,
"loss": 1.2449,
"step": 3980
},
{
"epoch": 0.1114470661350353,
"grad_norm": 11.154561996459961,
"learning_rate": 1.7771632869672084e-06,
"loss": 1.2969,
"step": 3990
},
{
"epoch": 0.11172638209026095,
"grad_norm": 12.427309036254883,
"learning_rate": 1.7766046589576e-06,
"loss": 1.3205,
"step": 4000
},
{
"epoch": 0.11194983485444147,
"eval_complexity_accuracy": 0.0,
"eval_loss": 1.337980031967163,
"eval_runtime": 33.7197,
"eval_samples_per_second": 14.828,
"eval_steps_per_second": 1.868,
"step": 4008
},
{
"epoch": 0.1120056980454866,
"grad_norm": 11.303837776184082,
"learning_rate": 1.7760460309479916e-06,
"loss": 1.2941,
"step": 4010
},
{
"epoch": 0.11228501400071225,
"grad_norm": 10.283913612365723,
"learning_rate": 1.7754874029383834e-06,
"loss": 1.2637,
"step": 4020
},
{
"epoch": 0.1125643299559379,
"grad_norm": 9.881290435791016,
"learning_rate": 1.774928774928775e-06,
"loss": 1.2764,
"step": 4030
},
{
"epoch": 0.11284364591116355,
"grad_norm": 10.254637718200684,
"learning_rate": 1.7743701469191665e-06,
"loss": 1.3215,
"step": 4040
},
{
"epoch": 0.11312296186638922,
"grad_norm": 11.556249618530273,
"learning_rate": 1.7738115189095579e-06,
"loss": 1.3581,
"step": 4050
},
{
"epoch": 0.11340227782161487,
"grad_norm": 11.59968376159668,
"learning_rate": 1.7732528908999497e-06,
"loss": 1.3089,
"step": 4060
},
{
"epoch": 0.11368159377684052,
"grad_norm": 11.252206802368164,
"learning_rate": 1.7726942628903412e-06,
"loss": 1.242,
"step": 4070
},
{
"epoch": 0.11396090973206617,
"grad_norm": 10.428114891052246,
"learning_rate": 1.7721356348807328e-06,
"loss": 1.3395,
"step": 4080
},
{
"epoch": 0.11424022568729182,
"grad_norm": 12.992630958557129,
"learning_rate": 1.7715770068711244e-06,
"loss": 1.305,
"step": 4090
},
{
"epoch": 0.11451954164251747,
"grad_norm": 10.460079193115234,
"learning_rate": 1.771018378861516e-06,
"loss": 1.2225,
"step": 4100
},
{
"epoch": 0.11479885759774312,
"grad_norm": 10.601390838623047,
"learning_rate": 1.7704597508519078e-06,
"loss": 1.3129,
"step": 4110
},
{
"epoch": 0.11507817355296877,
"grad_norm": 13.683563232421875,
"learning_rate": 1.7699011228422993e-06,
"loss": 1.3081,
"step": 4120
},
{
"epoch": 0.11535748950819444,
"grad_norm": 12.05490493774414,
"learning_rate": 1.769342494832691e-06,
"loss": 1.2893,
"step": 4130
},
{
"epoch": 0.11563680546342009,
"grad_norm": 10.546974182128906,
"learning_rate": 1.7687838668230823e-06,
"loss": 1.3494,
"step": 4140
},
{
"epoch": 0.11591612141864574,
"grad_norm": 11.625492095947266,
"learning_rate": 1.768225238813474e-06,
"loss": 1.2876,
"step": 4150
},
{
"epoch": 0.11619543737387139,
"grad_norm": 11.499431610107422,
"learning_rate": 1.7676666108038656e-06,
"loss": 1.2298,
"step": 4160
},
{
"epoch": 0.11647475332909704,
"grad_norm": 10.968666076660156,
"learning_rate": 1.7671079827942572e-06,
"loss": 1.3229,
"step": 4170
},
{
"epoch": 0.11675406928432269,
"grad_norm": 10.56057071685791,
"learning_rate": 1.7665493547846488e-06,
"loss": 1.2644,
"step": 4180
},
{
"epoch": 0.11703338523954834,
"grad_norm": 10.645150184631348,
"learning_rate": 1.7659907267750404e-06,
"loss": 1.3216,
"step": 4190
},
{
"epoch": 0.117312701194774,
"grad_norm": 10.945796966552734,
"learning_rate": 1.7654320987654322e-06,
"loss": 1.3395,
"step": 4200
},
{
"epoch": 0.11759201714999964,
"grad_norm": 11.30075740814209,
"learning_rate": 1.7648734707558237e-06,
"loss": 1.3201,
"step": 4210
},
{
"epoch": 0.11787133310522531,
"grad_norm": 11.912382125854492,
"learning_rate": 1.764314842746215e-06,
"loss": 1.3076,
"step": 4220
},
{
"epoch": 0.11815064906045096,
"grad_norm": 11.546857833862305,
"learning_rate": 1.7637562147366067e-06,
"loss": 1.2776,
"step": 4230
},
{
"epoch": 0.11842996501567661,
"grad_norm": 11.775701522827148,
"learning_rate": 1.7631975867269985e-06,
"loss": 1.3094,
"step": 4240
},
{
"epoch": 0.11870928097090226,
"grad_norm": 11.965110778808594,
"learning_rate": 1.76263895871739e-06,
"loss": 1.2815,
"step": 4250
},
{
"epoch": 0.11898859692612791,
"grad_norm": 9.932812690734863,
"learning_rate": 1.7620803307077816e-06,
"loss": 1.2965,
"step": 4260
},
{
"epoch": 0.11926791288135356,
"grad_norm": 10.788895606994629,
"learning_rate": 1.7615217026981732e-06,
"loss": 1.3025,
"step": 4270
},
{
"epoch": 0.11954722883657921,
"grad_norm": 12.008225440979004,
"learning_rate": 1.7609630746885648e-06,
"loss": 1.2758,
"step": 4280
},
{
"epoch": 0.11982654479180486,
"grad_norm": 11.157905578613281,
"learning_rate": 1.7604044466789566e-06,
"loss": 1.3369,
"step": 4290
},
{
"epoch": 0.12010586074703052,
"grad_norm": 12.967375755310059,
"learning_rate": 1.759845818669348e-06,
"loss": 1.3124,
"step": 4300
},
{
"epoch": 0.12038517670225618,
"grad_norm": 13.764420509338379,
"learning_rate": 1.7592871906597395e-06,
"loss": 1.3243,
"step": 4310
},
{
"epoch": 0.12066449265748183,
"grad_norm": 11.486067771911621,
"learning_rate": 1.758728562650131e-06,
"loss": 1.2865,
"step": 4320
},
{
"epoch": 0.12094380861270748,
"grad_norm": 11.377238273620605,
"learning_rate": 1.7581699346405229e-06,
"loss": 1.2747,
"step": 4330
},
{
"epoch": 0.12122312456793313,
"grad_norm": 11.644318580627441,
"learning_rate": 1.7576113066309144e-06,
"loss": 1.2855,
"step": 4340
},
{
"epoch": 0.12150244052315878,
"grad_norm": 11.282743453979492,
"learning_rate": 1.757052678621306e-06,
"loss": 1.2109,
"step": 4350
},
{
"epoch": 0.12178175647838443,
"grad_norm": 10.718985557556152,
"learning_rate": 1.7564940506116976e-06,
"loss": 1.3098,
"step": 4360
},
{
"epoch": 0.12206107243361008,
"grad_norm": 10.54099178314209,
"learning_rate": 1.7559354226020892e-06,
"loss": 1.2253,
"step": 4370
},
{
"epoch": 0.12234038838883574,
"grad_norm": 10.001184463500977,
"learning_rate": 1.755376794592481e-06,
"loss": 1.3096,
"step": 4380
},
{
"epoch": 0.1226197043440614,
"grad_norm": 10.10665512084961,
"learning_rate": 1.7548181665828723e-06,
"loss": 1.3204,
"step": 4390
},
{
"epoch": 0.12289902029928705,
"grad_norm": 13.317100524902344,
"learning_rate": 1.754259538573264e-06,
"loss": 1.2701,
"step": 4400
},
{
"epoch": 0.1231783362545127,
"grad_norm": 10.948107719421387,
"learning_rate": 1.7537009105636555e-06,
"loss": 1.3417,
"step": 4410
},
{
"epoch": 0.12345765220973835,
"grad_norm": 11.12563705444336,
"learning_rate": 1.7531422825540473e-06,
"loss": 1.2768,
"step": 4420
},
{
"epoch": 0.123736968164964,
"grad_norm": 11.270187377929688,
"learning_rate": 1.7525836545444389e-06,
"loss": 1.2476,
"step": 4430
},
{
"epoch": 0.12401628412018965,
"grad_norm": 11.370152473449707,
"learning_rate": 1.7520250265348304e-06,
"loss": 1.3711,
"step": 4440
},
{
"epoch": 0.1242956000754153,
"grad_norm": 12.357138633728027,
"learning_rate": 1.751466398525222e-06,
"loss": 1.2697,
"step": 4450
},
{
"epoch": 0.12457491603064096,
"grad_norm": 10.51325511932373,
"learning_rate": 1.7509077705156136e-06,
"loss": 1.3495,
"step": 4460
},
{
"epoch": 0.1248542319858666,
"grad_norm": 14.585171699523926,
"learning_rate": 1.7503491425060052e-06,
"loss": 1.3023,
"step": 4470
},
{
"epoch": 0.12513354794109227,
"grad_norm": 11.234824180603027,
"learning_rate": 1.7497905144963967e-06,
"loss": 1.2785,
"step": 4480
},
{
"epoch": 0.12541286389631792,
"grad_norm": 10.963340759277344,
"learning_rate": 1.7492318864867883e-06,
"loss": 1.3196,
"step": 4490
},
{
"epoch": 0.12569217985154357,
"grad_norm": 10.97410774230957,
"learning_rate": 1.7486732584771799e-06,
"loss": 1.3396,
"step": 4500
},
{
"epoch": 0.12594356421124667,
"eval_complexity_accuracy": 0.912,
"eval_loss": 1.3366564512252808,
"eval_runtime": 33.6692,
"eval_samples_per_second": 14.85,
"eval_steps_per_second": 1.871,
"step": 4509
},
{
"epoch": 0.12597149580676922,
"grad_norm": 10.35742473602295,
"learning_rate": 1.7481146304675715e-06,
"loss": 1.3028,
"step": 4510
},
{
"epoch": 0.12625081176199487,
"grad_norm": 11.008344650268555,
"learning_rate": 1.7475560024579633e-06,
"loss": 1.3369,
"step": 4520
},
{
"epoch": 0.12653012771722053,
"grad_norm": 13.630735397338867,
"learning_rate": 1.7469973744483548e-06,
"loss": 1.2786,
"step": 4530
},
{
"epoch": 0.12680944367244618,
"grad_norm": 11.712303161621094,
"learning_rate": 1.7464387464387464e-06,
"loss": 1.2838,
"step": 4540
},
{
"epoch": 0.12708875962767183,
"grad_norm": 11.680615425109863,
"learning_rate": 1.7458801184291378e-06,
"loss": 1.3096,
"step": 4550
},
{
"epoch": 0.12736807558289748,
"grad_norm": 9.936148643493652,
"learning_rate": 1.7453214904195296e-06,
"loss": 1.3508,
"step": 4560
},
{
"epoch": 0.12764739153812313,
"grad_norm": 10.1597261428833,
"learning_rate": 1.7447628624099211e-06,
"loss": 1.2996,
"step": 4570
},
{
"epoch": 0.12792670749334878,
"grad_norm": 9.299288749694824,
"learning_rate": 1.7442042344003127e-06,
"loss": 1.3327,
"step": 4580
},
{
"epoch": 0.12820602344857443,
"grad_norm": 11.090012550354004,
"learning_rate": 1.7436456063907043e-06,
"loss": 1.3217,
"step": 4590
},
{
"epoch": 0.12848533940380008,
"grad_norm": 10.919537544250488,
"learning_rate": 1.7430869783810959e-06,
"loss": 1.2738,
"step": 4600
},
{
"epoch": 0.12876465535902576,
"grad_norm": 10.606612205505371,
"learning_rate": 1.7425283503714877e-06,
"loss": 1.3391,
"step": 4610
},
{
"epoch": 0.1290439713142514,
"grad_norm": 11.103971481323242,
"learning_rate": 1.7419697223618792e-06,
"loss": 1.2768,
"step": 4620
},
{
"epoch": 0.12932328726947706,
"grad_norm": 10.45857048034668,
"learning_rate": 1.7414110943522708e-06,
"loss": 1.292,
"step": 4630
},
{
"epoch": 0.1296026032247027,
"grad_norm": 12.78720760345459,
"learning_rate": 1.7408524663426622e-06,
"loss": 1.2728,
"step": 4640
},
{
"epoch": 0.12988191917992836,
"grad_norm": 10.232451438903809,
"learning_rate": 1.740293838333054e-06,
"loss": 1.305,
"step": 4650
},
{
"epoch": 0.130161235135154,
"grad_norm": 10.413008689880371,
"learning_rate": 1.7397352103234455e-06,
"loss": 1.3135,
"step": 4660
},
{
"epoch": 0.13044055109037966,
"grad_norm": 11.938608169555664,
"learning_rate": 1.7391765823138371e-06,
"loss": 1.3101,
"step": 4670
},
{
"epoch": 0.13071986704560531,
"grad_norm": 10.876611709594727,
"learning_rate": 1.7386179543042287e-06,
"loss": 1.2753,
"step": 4680
},
{
"epoch": 0.13099918300083097,
"grad_norm": 10.039010047912598,
"learning_rate": 1.7380593262946203e-06,
"loss": 1.2957,
"step": 4690
},
{
"epoch": 0.13127849895605662,
"grad_norm": 13.189595222473145,
"learning_rate": 1.737500698285012e-06,
"loss": 1.2917,
"step": 4700
},
{
"epoch": 0.13155781491128227,
"grad_norm": 11.356356620788574,
"learning_rate": 1.7369420702754036e-06,
"loss": 1.3185,
"step": 4710
},
{
"epoch": 0.13183713086650792,
"grad_norm": 11.284613609313965,
"learning_rate": 1.736383442265795e-06,
"loss": 1.2605,
"step": 4720
},
{
"epoch": 0.13211644682173357,
"grad_norm": 9.668716430664062,
"learning_rate": 1.7358248142561866e-06,
"loss": 1.3078,
"step": 4730
},
{
"epoch": 0.13239576277695922,
"grad_norm": 12.375937461853027,
"learning_rate": 1.7352661862465784e-06,
"loss": 1.3748,
"step": 4740
},
{
"epoch": 0.13267507873218487,
"grad_norm": 11.52265453338623,
"learning_rate": 1.73470755823697e-06,
"loss": 1.3253,
"step": 4750
},
{
"epoch": 0.13295439468741052,
"grad_norm": 10.54103946685791,
"learning_rate": 1.7341489302273615e-06,
"loss": 1.2973,
"step": 4760
},
{
"epoch": 0.13323371064263617,
"grad_norm": 11.810563087463379,
"learning_rate": 1.733590302217753e-06,
"loss": 1.2983,
"step": 4770
},
{
"epoch": 0.13351302659786185,
"grad_norm": 11.471932411193848,
"learning_rate": 1.7330316742081447e-06,
"loss": 1.3038,
"step": 4780
},
{
"epoch": 0.1337923425530875,
"grad_norm": 11.196157455444336,
"learning_rate": 1.7324730461985365e-06,
"loss": 1.339,
"step": 4790
},
{
"epoch": 0.13407165850831315,
"grad_norm": 10.879687309265137,
"learning_rate": 1.7319144181889278e-06,
"loss": 1.2856,
"step": 4800
},
{
"epoch": 0.1343509744635388,
"grad_norm": 10.327743530273438,
"learning_rate": 1.7313557901793194e-06,
"loss": 1.3102,
"step": 4810
},
{
"epoch": 0.13463029041876445,
"grad_norm": 12.245965003967285,
"learning_rate": 1.730797162169711e-06,
"loss": 1.3179,
"step": 4820
},
{
"epoch": 0.1349096063739901,
"grad_norm": 10.2786226272583,
"learning_rate": 1.7302385341601028e-06,
"loss": 1.2797,
"step": 4830
},
{
"epoch": 0.13518892232921575,
"grad_norm": 10.446268081665039,
"learning_rate": 1.7296799061504943e-06,
"loss": 1.2856,
"step": 4840
},
{
"epoch": 0.1354682382844414,
"grad_norm": 11.422130584716797,
"learning_rate": 1.729121278140886e-06,
"loss": 1.323,
"step": 4850
},
{
"epoch": 0.13574755423966706,
"grad_norm": 11.97488021850586,
"learning_rate": 1.7285626501312775e-06,
"loss": 1.3054,
"step": 4860
},
{
"epoch": 0.1360268701948927,
"grad_norm": 11.220852851867676,
"learning_rate": 1.728004022121669e-06,
"loss": 1.3171,
"step": 4870
},
{
"epoch": 0.13630618615011836,
"grad_norm": 9.52205753326416,
"learning_rate": 1.7274453941120609e-06,
"loss": 1.2387,
"step": 4880
},
{
"epoch": 0.136585502105344,
"grad_norm": 10.432751655578613,
"learning_rate": 1.7268867661024522e-06,
"loss": 1.2646,
"step": 4890
},
{
"epoch": 0.13686481806056966,
"grad_norm": 11.69746208190918,
"learning_rate": 1.7263281380928438e-06,
"loss": 1.2954,
"step": 4900
},
{
"epoch": 0.1371441340157953,
"grad_norm": 10.778327941894531,
"learning_rate": 1.7257695100832354e-06,
"loss": 1.28,
"step": 4910
},
{
"epoch": 0.13742344997102096,
"grad_norm": 11.078811645507812,
"learning_rate": 1.7252108820736272e-06,
"loss": 1.2915,
"step": 4920
},
{
"epoch": 0.1377027659262466,
"grad_norm": 11.492058753967285,
"learning_rate": 1.7246522540640187e-06,
"loss": 1.2967,
"step": 4930
},
{
"epoch": 0.13798208188147226,
"grad_norm": 10.493326187133789,
"learning_rate": 1.7240936260544103e-06,
"loss": 1.3236,
"step": 4940
},
{
"epoch": 0.1382613978366979,
"grad_norm": 10.878108978271484,
"learning_rate": 1.723534998044802e-06,
"loss": 1.2702,
"step": 4950
},
{
"epoch": 0.1385407137919236,
"grad_norm": 11.983351707458496,
"learning_rate": 1.7229763700351935e-06,
"loss": 1.314,
"step": 4960
},
{
"epoch": 0.13882002974714924,
"grad_norm": 10.559981346130371,
"learning_rate": 1.722417742025585e-06,
"loss": 1.3231,
"step": 4970
},
{
"epoch": 0.1390993457023749,
"grad_norm": 12.265423774719238,
"learning_rate": 1.7218591140159766e-06,
"loss": 1.3418,
"step": 4980
},
{
"epoch": 0.13937866165760054,
"grad_norm": 9.850886344909668,
"learning_rate": 1.7213004860063682e-06,
"loss": 1.2421,
"step": 4990
},
{
"epoch": 0.1396579776128262,
"grad_norm": 10.524002075195312,
"learning_rate": 1.7207418579967598e-06,
"loss": 1.242,
"step": 5000
},
{
"epoch": 0.13993729356805185,
"grad_norm": 12.710641860961914,
"learning_rate": 1.7201832299871516e-06,
"loss": 1.2802,
"step": 5010
},
{
"epoch": 0.13993729356805185,
"eval_complexity_accuracy": 0.916,
"eval_loss": 1.3355051279067993,
"eval_runtime": 33.5364,
"eval_samples_per_second": 14.909,
"eval_steps_per_second": 1.879,
"step": 5010
},
{
"epoch": 0.1402166095232775,
"grad_norm": 10.802959442138672,
"learning_rate": 1.7196246019775432e-06,
"loss": 1.2864,
"step": 5020
},
{
"epoch": 0.14049592547850315,
"grad_norm": 10.689055442810059,
"learning_rate": 1.7190659739679347e-06,
"loss": 1.2735,
"step": 5030
},
{
"epoch": 0.1407752414337288,
"grad_norm": 11.609500885009766,
"learning_rate": 1.7185073459583263e-06,
"loss": 1.3131,
"step": 5040
},
{
"epoch": 0.14105455738895445,
"grad_norm": 11.694178581237793,
"learning_rate": 1.7179487179487177e-06,
"loss": 1.2796,
"step": 5050
},
{
"epoch": 0.1413338733441801,
"grad_norm": 10.71261215209961,
"learning_rate": 1.7173900899391095e-06,
"loss": 1.2928,
"step": 5060
},
{
"epoch": 0.14161318929940575,
"grad_norm": 11.323657989501953,
"learning_rate": 1.716831461929501e-06,
"loss": 1.3168,
"step": 5070
},
{
"epoch": 0.1418925052546314,
"grad_norm": 11.165552139282227,
"learning_rate": 1.7162728339198926e-06,
"loss": 1.3048,
"step": 5080
},
{
"epoch": 0.14217182120985705,
"grad_norm": 10.069772720336914,
"learning_rate": 1.7157142059102842e-06,
"loss": 1.3143,
"step": 5090
},
{
"epoch": 0.1424511371650827,
"grad_norm": 11.59792709350586,
"learning_rate": 1.715155577900676e-06,
"loss": 1.2753,
"step": 5100
},
{
"epoch": 0.14273045312030835,
"grad_norm": 10.197514533996582,
"learning_rate": 1.7145969498910676e-06,
"loss": 1.3432,
"step": 5110
},
{
"epoch": 0.143009769075534,
"grad_norm": 10.098687171936035,
"learning_rate": 1.7140383218814591e-06,
"loss": 1.2387,
"step": 5120
},
{
"epoch": 0.14328908503075968,
"grad_norm": 13.285723686218262,
"learning_rate": 1.7134796938718507e-06,
"loss": 1.2843,
"step": 5130
},
{
"epoch": 0.14356840098598533,
"grad_norm": 14.88563346862793,
"learning_rate": 1.712921065862242e-06,
"loss": 1.3464,
"step": 5140
},
{
"epoch": 0.14384771694121098,
"grad_norm": 10.287967681884766,
"learning_rate": 1.7123624378526339e-06,
"loss": 1.2919,
"step": 5150
},
{
"epoch": 0.14412703289643664,
"grad_norm": 13.416029930114746,
"learning_rate": 1.7118038098430254e-06,
"loss": 1.304,
"step": 5160
},
{
"epoch": 0.1444063488516623,
"grad_norm": 10.358808517456055,
"learning_rate": 1.711245181833417e-06,
"loss": 1.2667,
"step": 5170
},
{
"epoch": 0.14468566480688794,
"grad_norm": 9.454345703125,
"learning_rate": 1.7106865538238086e-06,
"loss": 1.2677,
"step": 5180
},
{
"epoch": 0.1449649807621136,
"grad_norm": 10.137917518615723,
"learning_rate": 1.7101279258142004e-06,
"loss": 1.246,
"step": 5190
},
{
"epoch": 0.14524429671733924,
"grad_norm": 10.27364730834961,
"learning_rate": 1.709569297804592e-06,
"loss": 1.2487,
"step": 5200
},
{
"epoch": 0.1455236126725649,
"grad_norm": 11.590679168701172,
"learning_rate": 1.7090106697949835e-06,
"loss": 1.3315,
"step": 5210
},
{
"epoch": 0.14580292862779054,
"grad_norm": 12.223170280456543,
"learning_rate": 1.708452041785375e-06,
"loss": 1.3591,
"step": 5220
},
{
"epoch": 0.1460822445830162,
"grad_norm": 10.8696928024292,
"learning_rate": 1.7078934137757665e-06,
"loss": 1.2855,
"step": 5230
},
{
"epoch": 0.14636156053824184,
"grad_norm": 10.847172737121582,
"learning_rate": 1.7073347857661583e-06,
"loss": 1.2744,
"step": 5240
},
{
"epoch": 0.1466408764934675,
"grad_norm": 11.290687561035156,
"learning_rate": 1.7067761577565498e-06,
"loss": 1.2815,
"step": 5250
},
{
"epoch": 0.14692019244869314,
"grad_norm": 10.246102333068848,
"learning_rate": 1.7062175297469414e-06,
"loss": 1.2697,
"step": 5260
},
{
"epoch": 0.1471995084039188,
"grad_norm": 10.220574378967285,
"learning_rate": 1.705658901737333e-06,
"loss": 1.274,
"step": 5270
},
{
"epoch": 0.14747882435914444,
"grad_norm": 11.137274742126465,
"learning_rate": 1.7051002737277248e-06,
"loss": 1.2915,
"step": 5280
},
{
"epoch": 0.1477581403143701,
"grad_norm": 11.349177360534668,
"learning_rate": 1.7045416457181164e-06,
"loss": 1.3005,
"step": 5290
},
{
"epoch": 0.14803745626959575,
"grad_norm": 11.108057975769043,
"learning_rate": 1.7039830177085077e-06,
"loss": 1.2922,
"step": 5300
},
{
"epoch": 0.14831677222482142,
"grad_norm": 10.836882591247559,
"learning_rate": 1.7034243896988993e-06,
"loss": 1.29,
"step": 5310
},
{
"epoch": 0.14859608818004708,
"grad_norm": 11.927931785583496,
"learning_rate": 1.7028657616892909e-06,
"loss": 1.3138,
"step": 5320
},
{
"epoch": 0.14887540413527273,
"grad_norm": 10.31083869934082,
"learning_rate": 1.7023071336796827e-06,
"loss": 1.3356,
"step": 5330
},
{
"epoch": 0.14915472009049838,
"grad_norm": 12.571051597595215,
"learning_rate": 1.7017485056700742e-06,
"loss": 1.3247,
"step": 5340
},
{
"epoch": 0.14943403604572403,
"grad_norm": 11.460820198059082,
"learning_rate": 1.7011898776604658e-06,
"loss": 1.3115,
"step": 5350
},
{
"epoch": 0.14971335200094968,
"grad_norm": 11.103178977966309,
"learning_rate": 1.7006312496508574e-06,
"loss": 1.288,
"step": 5360
},
{
"epoch": 0.14999266795617533,
"grad_norm": 11.281828880310059,
"learning_rate": 1.7000726216412492e-06,
"loss": 1.2285,
"step": 5370
},
{
"epoch": 0.15027198391140098,
"grad_norm": 12.560543060302734,
"learning_rate": 1.6995139936316408e-06,
"loss": 1.3078,
"step": 5380
},
{
"epoch": 0.15055129986662663,
"grad_norm": 10.196359634399414,
"learning_rate": 1.6989553656220321e-06,
"loss": 1.3406,
"step": 5390
},
{
"epoch": 0.15083061582185228,
"grad_norm": 10.276470184326172,
"learning_rate": 1.6983967376124237e-06,
"loss": 1.3514,
"step": 5400
},
{
"epoch": 0.15110993177707793,
"grad_norm": 10.547111511230469,
"learning_rate": 1.6978381096028153e-06,
"loss": 1.2428,
"step": 5410
},
{
"epoch": 0.15138924773230358,
"grad_norm": 14.352306365966797,
"learning_rate": 1.697279481593207e-06,
"loss": 1.3123,
"step": 5420
},
{
"epoch": 0.15166856368752923,
"grad_norm": 11.18830394744873,
"learning_rate": 1.6967208535835986e-06,
"loss": 1.2438,
"step": 5430
},
{
"epoch": 0.15194787964275489,
"grad_norm": 10.590067863464355,
"learning_rate": 1.6961622255739902e-06,
"loss": 1.3224,
"step": 5440
},
{
"epoch": 0.15222719559798054,
"grad_norm": 10.839982032775879,
"learning_rate": 1.6956035975643818e-06,
"loss": 1.284,
"step": 5450
},
{
"epoch": 0.1525065115532062,
"grad_norm": 10.421679496765137,
"learning_rate": 1.6950449695547736e-06,
"loss": 1.2974,
"step": 5460
},
{
"epoch": 0.15278582750843184,
"grad_norm": 10.920546531677246,
"learning_rate": 1.694486341545165e-06,
"loss": 1.3018,
"step": 5470
},
{
"epoch": 0.15306514346365752,
"grad_norm": 10.71149730682373,
"learning_rate": 1.6939277135355565e-06,
"loss": 1.3121,
"step": 5480
},
{
"epoch": 0.15334445941888317,
"grad_norm": 10.763243675231934,
"learning_rate": 1.6933690855259481e-06,
"loss": 1.2922,
"step": 5490
},
{
"epoch": 0.15362377537410882,
"grad_norm": 12.36917781829834,
"learning_rate": 1.6928104575163397e-06,
"loss": 1.2845,
"step": 5500
}
],
"logging_steps": 10,
"max_steps": 35802,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}