llama-debug-unlearn / trainer_state.json
jackysnake's picture
Upload folder using huggingface_hub
0b29036 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4218,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007112375533428165,
"grad_norm": 27.90213406968924,
"learning_rate": 2.132701421800948e-07,
"loss": 3.2736,
"step": 10
},
{
"epoch": 0.01422475106685633,
"grad_norm": 30.079497662873973,
"learning_rate": 4.502369668246446e-07,
"loss": 3.0907,
"step": 20
},
{
"epoch": 0.021337126600284494,
"grad_norm": 22.857450095219185,
"learning_rate": 6.872037914691944e-07,
"loss": 2.5585,
"step": 30
},
{
"epoch": 0.02844950213371266,
"grad_norm": 9.747429876049775,
"learning_rate": 9.241706161137441e-07,
"loss": 2.0379,
"step": 40
},
{
"epoch": 0.03556187766714083,
"grad_norm": 7.291577115660012,
"learning_rate": 1.161137440758294e-06,
"loss": 1.8576,
"step": 50
},
{
"epoch": 0.04267425320056899,
"grad_norm": 5.532213240888032,
"learning_rate": 1.3981042654028437e-06,
"loss": 1.7949,
"step": 60
},
{
"epoch": 0.049786628733997154,
"grad_norm": 5.933113444425918,
"learning_rate": 1.6350710900473934e-06,
"loss": 1.717,
"step": 70
},
{
"epoch": 0.05689900426742532,
"grad_norm": 6.263277176684849,
"learning_rate": 1.8720379146919433e-06,
"loss": 1.6602,
"step": 80
},
{
"epoch": 0.06401137980085349,
"grad_norm": 5.6983842698392335,
"learning_rate": 2.109004739336493e-06,
"loss": 1.64,
"step": 90
},
{
"epoch": 0.07112375533428165,
"grad_norm": 5.6269726113930885,
"learning_rate": 2.345971563981043e-06,
"loss": 1.5702,
"step": 100
},
{
"epoch": 0.07823613086770982,
"grad_norm": 6.030572227568665,
"learning_rate": 2.5829383886255925e-06,
"loss": 1.5699,
"step": 110
},
{
"epoch": 0.08534850640113797,
"grad_norm": 5.647899868810465,
"learning_rate": 2.8199052132701426e-06,
"loss": 1.5838,
"step": 120
},
{
"epoch": 0.09246088193456614,
"grad_norm": 5.411803675210431,
"learning_rate": 3.0568720379146923e-06,
"loss": 1.5524,
"step": 130
},
{
"epoch": 0.09957325746799431,
"grad_norm": 6.081650138994435,
"learning_rate": 3.293838862559242e-06,
"loss": 1.4833,
"step": 140
},
{
"epoch": 0.10668563300142248,
"grad_norm": 5.828711910857202,
"learning_rate": 3.5308056872037916e-06,
"loss": 1.4856,
"step": 150
},
{
"epoch": 0.11379800853485064,
"grad_norm": 6.169902161575631,
"learning_rate": 3.7677725118483417e-06,
"loss": 1.5518,
"step": 160
},
{
"epoch": 0.12091038406827881,
"grad_norm": 6.587348119423249,
"learning_rate": 4.004739336492891e-06,
"loss": 1.4465,
"step": 170
},
{
"epoch": 0.12802275960170698,
"grad_norm": 5.895125466115613,
"learning_rate": 4.2417061611374415e-06,
"loss": 1.4266,
"step": 180
},
{
"epoch": 0.13513513513513514,
"grad_norm": 6.8094291227728645,
"learning_rate": 4.478672985781991e-06,
"loss": 1.3488,
"step": 190
},
{
"epoch": 0.1422475106685633,
"grad_norm": 5.67979253930217,
"learning_rate": 4.715639810426541e-06,
"loss": 1.3592,
"step": 200
},
{
"epoch": 0.14935988620199148,
"grad_norm": 5.707997611061335,
"learning_rate": 4.952606635071091e-06,
"loss": 1.3454,
"step": 210
},
{
"epoch": 0.15647226173541964,
"grad_norm": 5.768661138747192,
"learning_rate": 5.18957345971564e-06,
"loss": 1.3214,
"step": 220
},
{
"epoch": 0.16358463726884778,
"grad_norm": 6.881542275829192,
"learning_rate": 5.42654028436019e-06,
"loss": 1.3095,
"step": 230
},
{
"epoch": 0.17069701280227595,
"grad_norm": 6.71203878660968,
"learning_rate": 5.66350710900474e-06,
"loss": 1.3164,
"step": 240
},
{
"epoch": 0.17780938833570412,
"grad_norm": 6.234033501973343,
"learning_rate": 5.90047393364929e-06,
"loss": 1.2996,
"step": 250
},
{
"epoch": 0.18492176386913228,
"grad_norm": 8.33843806575378,
"learning_rate": 6.137440758293839e-06,
"loss": 1.2654,
"step": 260
},
{
"epoch": 0.19203413940256045,
"grad_norm": 7.595129348800133,
"learning_rate": 6.374407582938389e-06,
"loss": 1.206,
"step": 270
},
{
"epoch": 0.19914651493598862,
"grad_norm": 6.466022627087353,
"learning_rate": 6.611374407582939e-06,
"loss": 1.1502,
"step": 280
},
{
"epoch": 0.20625889046941678,
"grad_norm": 5.968433152101503,
"learning_rate": 6.848341232227489e-06,
"loss": 1.1757,
"step": 290
},
{
"epoch": 0.21337126600284495,
"grad_norm": 6.614463560965846,
"learning_rate": 7.085308056872039e-06,
"loss": 1.1513,
"step": 300
},
{
"epoch": 0.22048364153627312,
"grad_norm": 6.349655408064992,
"learning_rate": 7.322274881516588e-06,
"loss": 1.1687,
"step": 310
},
{
"epoch": 0.22759601706970128,
"grad_norm": 7.059546274489307,
"learning_rate": 7.559241706161138e-06,
"loss": 1.0784,
"step": 320
},
{
"epoch": 0.23470839260312945,
"grad_norm": 7.174629090661151,
"learning_rate": 7.796208530805689e-06,
"loss": 1.1724,
"step": 330
},
{
"epoch": 0.24182076813655762,
"grad_norm": 7.1852274620100625,
"learning_rate": 8.033175355450237e-06,
"loss": 1.0926,
"step": 340
},
{
"epoch": 0.24893314366998578,
"grad_norm": 6.094122405795049,
"learning_rate": 8.270142180094787e-06,
"loss": 1.0875,
"step": 350
},
{
"epoch": 0.25604551920341395,
"grad_norm": 6.582672868116728,
"learning_rate": 8.507109004739337e-06,
"loss": 1.0287,
"step": 360
},
{
"epoch": 0.2631578947368421,
"grad_norm": 6.88641059493668,
"learning_rate": 8.744075829383887e-06,
"loss": 1.0029,
"step": 370
},
{
"epoch": 0.2702702702702703,
"grad_norm": 7.170889864871836,
"learning_rate": 8.981042654028437e-06,
"loss": 0.9941,
"step": 380
},
{
"epoch": 0.2773826458036984,
"grad_norm": 6.028377283203336,
"learning_rate": 9.218009478672988e-06,
"loss": 0.8674,
"step": 390
},
{
"epoch": 0.2844950213371266,
"grad_norm": 6.377454467317432,
"learning_rate": 9.454976303317538e-06,
"loss": 0.9614,
"step": 400
},
{
"epoch": 0.29160739687055476,
"grad_norm": 7.795658548149459,
"learning_rate": 9.691943127962086e-06,
"loss": 0.888,
"step": 410
},
{
"epoch": 0.29871977240398295,
"grad_norm": 7.482126849367938,
"learning_rate": 9.928909952606636e-06,
"loss": 0.9175,
"step": 420
},
{
"epoch": 0.3058321479374111,
"grad_norm": 6.2359374732919655,
"learning_rate": 9.99991609608766e-06,
"loss": 0.8985,
"step": 430
},
{
"epoch": 0.3129445234708393,
"grad_norm": 6.511421095028972,
"learning_rate": 9.999505144928566e-06,
"loss": 0.9245,
"step": 440
},
{
"epoch": 0.3200568990042674,
"grad_norm": 6.684499797692544,
"learning_rate": 9.998751763712045e-06,
"loss": 0.8095,
"step": 450
},
{
"epoch": 0.32716927453769556,
"grad_norm": 6.52623510868485,
"learning_rate": 9.997656004039284e-06,
"loss": 0.7917,
"step": 460
},
{
"epoch": 0.33428165007112376,
"grad_norm": 6.391785139835116,
"learning_rate": 9.99621794096192e-06,
"loss": 0.8154,
"step": 470
},
{
"epoch": 0.3413940256045519,
"grad_norm": 6.061600108949675,
"learning_rate": 9.994437672976904e-06,
"loss": 0.7042,
"step": 480
},
{
"epoch": 0.3485064011379801,
"grad_norm": 7.003990519280338,
"learning_rate": 9.99231532201976e-06,
"loss": 0.7912,
"step": 490
},
{
"epoch": 0.35561877667140823,
"grad_norm": 6.376557739362439,
"learning_rate": 9.989851033456224e-06,
"loss": 0.753,
"step": 500
},
{
"epoch": 0.3627311522048364,
"grad_norm": 6.674903860198922,
"learning_rate": 9.987044976072298e-06,
"loss": 0.7059,
"step": 510
},
{
"epoch": 0.36984352773826457,
"grad_norm": 6.92825266420183,
"learning_rate": 9.983897342062681e-06,
"loss": 0.7308,
"step": 520
},
{
"epoch": 0.37695590327169276,
"grad_norm": 6.802916451931196,
"learning_rate": 9.98040834701761e-06,
"loss": 0.7136,
"step": 530
},
{
"epoch": 0.3840682788051209,
"grad_norm": 5.5147042227151575,
"learning_rate": 9.97657822990809e-06,
"loss": 0.7085,
"step": 540
},
{
"epoch": 0.3911806543385491,
"grad_norm": 6.090876343717448,
"learning_rate": 9.972407253069527e-06,
"loss": 0.7049,
"step": 550
},
{
"epoch": 0.39829302987197723,
"grad_norm": 5.920393451920263,
"learning_rate": 9.967895702183767e-06,
"loss": 0.7136,
"step": 560
},
{
"epoch": 0.40540540540540543,
"grad_norm": 6.740639908684347,
"learning_rate": 9.963043886259518e-06,
"loss": 0.6648,
"step": 570
},
{
"epoch": 0.41251778093883357,
"grad_norm": 6.190326665175998,
"learning_rate": 9.957852137611187e-06,
"loss": 0.6538,
"step": 580
},
{
"epoch": 0.41963015647226176,
"grad_norm": 5.8193938810967865,
"learning_rate": 9.952320811836129e-06,
"loss": 0.5848,
"step": 590
},
{
"epoch": 0.4267425320056899,
"grad_norm": 5.231106852130435,
"learning_rate": 9.94645028779028e-06,
"loss": 0.6117,
"step": 600
},
{
"epoch": 0.43385490753911804,
"grad_norm": 6.42206432654939,
"learning_rate": 9.94024096756221e-06,
"loss": 0.5831,
"step": 610
},
{
"epoch": 0.44096728307254623,
"grad_norm": 7.364483477227342,
"learning_rate": 9.933693276445588e-06,
"loss": 0.5741,
"step": 620
},
{
"epoch": 0.4480796586059744,
"grad_norm": 5.6944684489072355,
"learning_rate": 9.92680766291005e-06,
"loss": 0.5976,
"step": 630
},
{
"epoch": 0.45519203413940257,
"grad_norm": 5.6237891453652455,
"learning_rate": 9.91958459857048e-06,
"loss": 0.5472,
"step": 640
},
{
"epoch": 0.4623044096728307,
"grad_norm": 5.1230807672316505,
"learning_rate": 9.912024578154706e-06,
"loss": 0.5302,
"step": 650
},
{
"epoch": 0.4694167852062589,
"grad_norm": 6.222140354090307,
"learning_rate": 9.904128119469625e-06,
"loss": 0.5292,
"step": 660
},
{
"epoch": 0.47652916073968704,
"grad_norm": 5.82908513723418,
"learning_rate": 9.895895763365722e-06,
"loss": 0.513,
"step": 670
},
{
"epoch": 0.48364153627311524,
"grad_norm": 5.479135634645378,
"learning_rate": 9.88732807370004e-06,
"loss": 0.4549,
"step": 680
},
{
"epoch": 0.4907539118065434,
"grad_norm": 6.604832724403904,
"learning_rate": 9.878425637297549e-06,
"loss": 0.4254,
"step": 690
},
{
"epoch": 0.49786628733997157,
"grad_norm": 5.618060296345201,
"learning_rate": 9.869189063910959e-06,
"loss": 0.4346,
"step": 700
},
{
"epoch": 0.5049786628733998,
"grad_norm": 6.326935885254636,
"learning_rate": 9.859618986178953e-06,
"loss": 0.4311,
"step": 710
},
{
"epoch": 0.5120910384068279,
"grad_norm": 6.411399143629572,
"learning_rate": 9.84971605958286e-06,
"loss": 0.4268,
"step": 720
},
{
"epoch": 0.519203413940256,
"grad_norm": 6.244636888029725,
"learning_rate": 9.839480962401753e-06,
"loss": 0.3943,
"step": 730
},
{
"epoch": 0.5263157894736842,
"grad_norm": 4.9638521174081935,
"learning_rate": 9.828914395665996e-06,
"loss": 0.3704,
"step": 740
},
{
"epoch": 0.5334281650071123,
"grad_norm": 5.557286789154222,
"learning_rate": 9.818017083109233e-06,
"loss": 0.4167,
"step": 750
},
{
"epoch": 0.5405405405405406,
"grad_norm": 5.693084707021369,
"learning_rate": 9.8067897711188e-06,
"loss": 0.3426,
"step": 760
},
{
"epoch": 0.5476529160739687,
"grad_norm": 6.528132775066785,
"learning_rate": 9.795233228684631e-06,
"loss": 0.3325,
"step": 770
},
{
"epoch": 0.5547652916073968,
"grad_norm": 4.9516616181943744,
"learning_rate": 9.783348247346558e-06,
"loss": 0.3526,
"step": 780
},
{
"epoch": 0.561877667140825,
"grad_norm": 5.285013263137629,
"learning_rate": 9.771135641140117e-06,
"loss": 0.2811,
"step": 790
},
{
"epoch": 0.5689900426742532,
"grad_norm": 5.427236432451273,
"learning_rate": 9.758596246540782e-06,
"loss": 0.3714,
"step": 800
},
{
"epoch": 0.5761024182076814,
"grad_norm": 4.2910836634846135,
"learning_rate": 9.74573092240668e-06,
"loss": 0.3484,
"step": 810
},
{
"epoch": 0.5832147937411095,
"grad_norm": 5.462486995732004,
"learning_rate": 9.732540549919758e-06,
"loss": 0.3275,
"step": 820
},
{
"epoch": 0.5903271692745377,
"grad_norm": 5.748493175038991,
"learning_rate": 9.719026032525432e-06,
"loss": 0.3134,
"step": 830
},
{
"epoch": 0.5974395448079659,
"grad_norm": 5.605839009853595,
"learning_rate": 9.70518829587071e-06,
"loss": 0.2918,
"step": 840
},
{
"epoch": 0.604551920341394,
"grad_norm": 4.3649770239167305,
"learning_rate": 9.691028287740783e-06,
"loss": 0.2941,
"step": 850
},
{
"epoch": 0.6116642958748222,
"grad_norm": 6.618364031460492,
"learning_rate": 9.67654697799412e-06,
"loss": 0.3041,
"step": 860
},
{
"epoch": 0.6187766714082503,
"grad_norm": 4.289539717689249,
"learning_rate": 9.661745358496033e-06,
"loss": 0.2691,
"step": 870
},
{
"epoch": 0.6258890469416786,
"grad_norm": 4.403362504112069,
"learning_rate": 9.64662444305074e-06,
"loss": 0.2789,
"step": 880
},
{
"epoch": 0.6330014224751067,
"grad_norm": 5.121206118009773,
"learning_rate": 9.631185267331937e-06,
"loss": 0.2585,
"step": 890
},
{
"epoch": 0.6401137980085349,
"grad_norm": 4.877490307909731,
"learning_rate": 9.615428888811842e-06,
"loss": 0.2499,
"step": 900
},
{
"epoch": 0.647226173541963,
"grad_norm": 5.01492577738939,
"learning_rate": 9.59935638668879e-06,
"loss": 0.2555,
"step": 910
},
{
"epoch": 0.6543385490753911,
"grad_norm": 6.3855818621534555,
"learning_rate": 9.582968861813295e-06,
"loss": 0.2193,
"step": 920
},
{
"epoch": 0.6614509246088194,
"grad_norm": 4.33698276684028,
"learning_rate": 9.566267436612662e-06,
"loss": 0.2533,
"step": 930
},
{
"epoch": 0.6685633001422475,
"grad_norm": 4.685807210281303,
"learning_rate": 9.549253255014105e-06,
"loss": 0.2141,
"step": 940
},
{
"epoch": 0.6756756756756757,
"grad_norm": 5.275159675837076,
"learning_rate": 9.531927482366398e-06,
"loss": 0.2121,
"step": 950
},
{
"epoch": 0.6827880512091038,
"grad_norm": 4.756073343523271,
"learning_rate": 9.514291305360053e-06,
"loss": 0.2187,
"step": 960
},
{
"epoch": 0.689900426742532,
"grad_norm": 5.024551500100791,
"learning_rate": 9.496345931946039e-06,
"loss": 0.1841,
"step": 970
},
{
"epoch": 0.6970128022759602,
"grad_norm": 4.283692003797298,
"learning_rate": 9.47809259125306e-06,
"loss": 0.225,
"step": 980
},
{
"epoch": 0.7041251778093883,
"grad_norm": 5.201732993672687,
"learning_rate": 9.459532533503347e-06,
"loss": 0.184,
"step": 990
},
{
"epoch": 0.7112375533428165,
"grad_norm": 4.806159299085367,
"learning_rate": 9.440667029927043e-06,
"loss": 0.1774,
"step": 1000
},
{
"epoch": 0.7183499288762447,
"grad_norm": 4.391296680269581,
"learning_rate": 9.421497372675133e-06,
"loss": 0.1729,
"step": 1010
},
{
"epoch": 0.7254623044096729,
"grad_norm": 3.4175211871739792,
"learning_rate": 9.402024874730928e-06,
"loss": 0.2066,
"step": 1020
},
{
"epoch": 0.732574679943101,
"grad_norm": 5.917646407329018,
"learning_rate": 9.382250869820146e-06,
"loss": 0.159,
"step": 1030
},
{
"epoch": 0.7396870554765291,
"grad_norm": 4.5654765488178155,
"learning_rate": 9.36217671231956e-06,
"loss": 0.1924,
"step": 1040
},
{
"epoch": 0.7467994310099573,
"grad_norm": 4.148546026699148,
"learning_rate": 9.341803777164228e-06,
"loss": 0.1414,
"step": 1050
},
{
"epoch": 0.7539118065433855,
"grad_norm": 5.299579387506197,
"learning_rate": 9.321133459753322e-06,
"loss": 0.1751,
"step": 1060
},
{
"epoch": 0.7610241820768137,
"grad_norm": 3.9468784531968426,
"learning_rate": 9.300167175854564e-06,
"loss": 0.1642,
"step": 1070
},
{
"epoch": 0.7681365576102418,
"grad_norm": 4.258905236243376,
"learning_rate": 9.278906361507238e-06,
"loss": 0.1362,
"step": 1080
},
{
"epoch": 0.7752489331436699,
"grad_norm": 4.177714057989964,
"learning_rate": 9.257352472923842e-06,
"loss": 0.1375,
"step": 1090
},
{
"epoch": 0.7823613086770982,
"grad_norm": 4.631019052366409,
"learning_rate": 9.235506986390346e-06,
"loss": 0.1282,
"step": 1100
},
{
"epoch": 0.7894736842105263,
"grad_norm": 4.025666581154786,
"learning_rate": 9.213371398165077e-06,
"loss": 0.1341,
"step": 1110
},
{
"epoch": 0.7965860597439545,
"grad_norm": 5.312761237015971,
"learning_rate": 9.190947224376238e-06,
"loss": 0.1624,
"step": 1120
},
{
"epoch": 0.8036984352773826,
"grad_norm": 4.847580150934388,
"learning_rate": 9.168236000918063e-06,
"loss": 0.1228,
"step": 1130
},
{
"epoch": 0.8108108108108109,
"grad_norm": 3.681304323084205,
"learning_rate": 9.145239283345618e-06,
"loss": 0.1295,
"step": 1140
},
{
"epoch": 0.817923186344239,
"grad_norm": 2.856809444870011,
"learning_rate": 9.121958646768251e-06,
"loss": 0.1239,
"step": 1150
},
{
"epoch": 0.8250355618776671,
"grad_norm": 3.8759882864356574,
"learning_rate": 9.09839568574173e-06,
"loss": 0.1058,
"step": 1160
},
{
"epoch": 0.8321479374110953,
"grad_norm": 3.520732889394545,
"learning_rate": 9.074552014158994e-06,
"loss": 0.1085,
"step": 1170
},
{
"epoch": 0.8392603129445235,
"grad_norm": 3.343174269734267,
"learning_rate": 9.050429265139647e-06,
"loss": 0.1158,
"step": 1180
},
{
"epoch": 0.8463726884779517,
"grad_norm": 4.5226917893573475,
"learning_rate": 9.026029090918076e-06,
"loss": 0.1222,
"step": 1190
},
{
"epoch": 0.8534850640113798,
"grad_norm": 3.7945492207672564,
"learning_rate": 9.001353162730297e-06,
"loss": 0.1112,
"step": 1200
},
{
"epoch": 0.8605974395448079,
"grad_norm": 4.30755037317223,
"learning_rate": 8.976403170699486e-06,
"loss": 0.1065,
"step": 1210
},
{
"epoch": 0.8677098150782361,
"grad_norm": 2.8075717567217033,
"learning_rate": 8.951180823720212e-06,
"loss": 0.0936,
"step": 1220
},
{
"epoch": 0.8748221906116643,
"grad_norm": 4.173272966405032,
"learning_rate": 8.925687849341398e-06,
"loss": 0.0794,
"step": 1230
},
{
"epoch": 0.8819345661450925,
"grad_norm": 3.41589130381316,
"learning_rate": 8.899925993647994e-06,
"loss": 0.0897,
"step": 1240
},
{
"epoch": 0.8890469416785206,
"grad_norm": 4.346956823329345,
"learning_rate": 8.873897021141378e-06,
"loss": 0.0887,
"step": 1250
},
{
"epoch": 0.8961593172119487,
"grad_norm": 3.51891817822506,
"learning_rate": 8.847602714618504e-06,
"loss": 0.0846,
"step": 1260
},
{
"epoch": 0.903271692745377,
"grad_norm": 3.106657795276442,
"learning_rate": 8.821044875049796e-06,
"loss": 0.0883,
"step": 1270
},
{
"epoch": 0.9103840682788051,
"grad_norm": 3.469975395363045,
"learning_rate": 8.794225321455788e-06,
"loss": 0.0893,
"step": 1280
},
{
"epoch": 0.9174964438122333,
"grad_norm": 4.092170025970377,
"learning_rate": 8.767145890782542e-06,
"loss": 0.0912,
"step": 1290
},
{
"epoch": 0.9246088193456614,
"grad_norm": 3.970701793851334,
"learning_rate": 8.739808437775825e-06,
"loss": 0.0759,
"step": 1300
},
{
"epoch": 0.9317211948790897,
"grad_norm": 4.338458399102716,
"learning_rate": 8.71221483485407e-06,
"loss": 0.0938,
"step": 1310
},
{
"epoch": 0.9388335704125178,
"grad_norm": 3.4360396182886963,
"learning_rate": 8.684366971980139e-06,
"loss": 0.0804,
"step": 1320
},
{
"epoch": 0.9459459459459459,
"grad_norm": 3.3921438457695654,
"learning_rate": 8.656266756531857e-06,
"loss": 0.0734,
"step": 1330
},
{
"epoch": 0.9530583214793741,
"grad_norm": 3.6703934299551126,
"learning_rate": 8.627916113171396e-06,
"loss": 0.0794,
"step": 1340
},
{
"epoch": 0.9601706970128022,
"grad_norm": 3.0528739658231823,
"learning_rate": 8.599316983713419e-06,
"loss": 0.0729,
"step": 1350
},
{
"epoch": 0.9672830725462305,
"grad_norm": 3.205301205746107,
"learning_rate": 8.570471326992105e-06,
"loss": 0.0776,
"step": 1360
},
{
"epoch": 0.9743954480796586,
"grad_norm": 3.39755647080461,
"learning_rate": 8.54138111872697e-06,
"loss": 0.0801,
"step": 1370
},
{
"epoch": 0.9815078236130867,
"grad_norm": 2.5637363679350056,
"learning_rate": 8.512048351387551e-06,
"loss": 0.0698,
"step": 1380
},
{
"epoch": 0.9886201991465149,
"grad_norm": 2.5368692351882367,
"learning_rate": 8.482475034056927e-06,
"loss": 0.0719,
"step": 1390
},
{
"epoch": 0.9957325746799431,
"grad_norm": 2.729862342028922,
"learning_rate": 8.452663192294121e-06,
"loss": 0.0704,
"step": 1400
},
{
"epoch": 1.0028449502133712,
"grad_norm": 3.2434913299704524,
"learning_rate": 8.42261486799536e-06,
"loss": 0.0619,
"step": 1410
},
{
"epoch": 1.0099573257467995,
"grad_norm": 2.7769965300793844,
"learning_rate": 8.392332119254214e-06,
"loss": 0.0498,
"step": 1420
},
{
"epoch": 1.0170697012802277,
"grad_norm": 2.824241471905449,
"learning_rate": 8.361817020220647e-06,
"loss": 0.0468,
"step": 1430
},
{
"epoch": 1.0241820768136558,
"grad_norm": 2.9150859408507794,
"learning_rate": 8.331071660958936e-06,
"loss": 0.0556,
"step": 1440
},
{
"epoch": 1.031294452347084,
"grad_norm": 1.907604048350405,
"learning_rate": 8.300098147304523e-06,
"loss": 0.047,
"step": 1450
},
{
"epoch": 1.038406827880512,
"grad_norm": 3.404202261022711,
"learning_rate": 8.268898600719785e-06,
"loss": 0.0553,
"step": 1460
},
{
"epoch": 1.0455192034139402,
"grad_norm": 3.931120264062831,
"learning_rate": 8.237475158148724e-06,
"loss": 0.0536,
"step": 1470
},
{
"epoch": 1.0526315789473684,
"grad_norm": 3.1620958675587403,
"learning_rate": 8.205829971870602e-06,
"loss": 0.0479,
"step": 1480
},
{
"epoch": 1.0597439544807965,
"grad_norm": 2.7429804696608935,
"learning_rate": 8.173965209352524e-06,
"loss": 0.048,
"step": 1490
},
{
"epoch": 1.0668563300142249,
"grad_norm": 3.6303992768842437,
"learning_rate": 8.14188305310099e-06,
"loss": 0.0582,
"step": 1500
},
{
"epoch": 1.073968705547653,
"grad_norm": 3.293723594586508,
"learning_rate": 8.109585700512395e-06,
"loss": 0.0507,
"step": 1510
},
{
"epoch": 1.0810810810810811,
"grad_norm": 3.4841344851942755,
"learning_rate": 8.077075363722542e-06,
"loss": 0.0492,
"step": 1520
},
{
"epoch": 1.0881934566145093,
"grad_norm": 3.7536302337999277,
"learning_rate": 8.044354269455109e-06,
"loss": 0.059,
"step": 1530
},
{
"epoch": 1.0953058321479374,
"grad_norm": 3.261925658312736,
"learning_rate": 8.011424658869142e-06,
"loss": 0.0496,
"step": 1540
},
{
"epoch": 1.1024182076813656,
"grad_norm": 3.2649673226199503,
"learning_rate": 7.978288787405556e-06,
"loss": 0.0428,
"step": 1550
},
{
"epoch": 1.1095305832147937,
"grad_norm": 3.1908862790356185,
"learning_rate": 7.944948924632643e-06,
"loss": 0.0484,
"step": 1560
},
{
"epoch": 1.1166429587482218,
"grad_norm": 2.980091794093968,
"learning_rate": 7.911407354090634e-06,
"loss": 0.0485,
"step": 1570
},
{
"epoch": 1.12375533428165,
"grad_norm": 2.6098085172978287,
"learning_rate": 7.877666373135287e-06,
"loss": 0.0441,
"step": 1580
},
{
"epoch": 1.1308677098150781,
"grad_norm": 2.6329214221965485,
"learning_rate": 7.84372829278053e-06,
"loss": 0.052,
"step": 1590
},
{
"epoch": 1.1379800853485065,
"grad_norm": 2.9828411095241862,
"learning_rate": 7.809595437540189e-06,
"loss": 0.0444,
"step": 1600
},
{
"epoch": 1.1450924608819346,
"grad_norm": 2.8256209003189183,
"learning_rate": 7.775270145268755e-06,
"loss": 0.0405,
"step": 1610
},
{
"epoch": 1.1522048364153628,
"grad_norm": 2.1852050342228053,
"learning_rate": 7.740754767001278e-06,
"loss": 0.0443,
"step": 1620
},
{
"epoch": 1.159317211948791,
"grad_norm": 2.9459065182368223,
"learning_rate": 7.706051666792318e-06,
"loss": 0.044,
"step": 1630
},
{
"epoch": 1.166429587482219,
"grad_norm": 1.8756599971066235,
"learning_rate": 7.671163221554043e-06,
"loss": 0.0443,
"step": 1640
},
{
"epoch": 1.1735419630156472,
"grad_norm": 2.546613866229381,
"learning_rate": 7.636091820893417e-06,
"loss": 0.0485,
"step": 1650
},
{
"epoch": 1.1806543385490753,
"grad_norm": 2.6687368539976215,
"learning_rate": 7.600839866948528e-06,
"loss": 0.0479,
"step": 1660
},
{
"epoch": 1.1877667140825037,
"grad_norm": 3.1352891064330235,
"learning_rate": 7.565409774224066e-06,
"loss": 0.0478,
"step": 1670
},
{
"epoch": 1.1948790896159318,
"grad_norm": 2.840525979100908,
"learning_rate": 7.529803969425941e-06,
"loss": 0.0402,
"step": 1680
},
{
"epoch": 1.20199146514936,
"grad_norm": 2.6839166671210015,
"learning_rate": 7.494024891295075e-06,
"loss": 0.0456,
"step": 1690
},
{
"epoch": 1.209103840682788,
"grad_norm": 2.5400915247833513,
"learning_rate": 7.458074990440363e-06,
"loss": 0.0399,
"step": 1700
},
{
"epoch": 1.2162162162162162,
"grad_norm": 2.4640207101809923,
"learning_rate": 7.421956729170823e-06,
"loss": 0.0408,
"step": 1710
},
{
"epoch": 1.2233285917496444,
"grad_norm": 2.9057910772118296,
"learning_rate": 7.385672581326954e-06,
"loss": 0.0532,
"step": 1720
},
{
"epoch": 1.2304409672830725,
"grad_norm": 2.2101614324462004,
"learning_rate": 7.34922503211128e-06,
"loss": 0.0412,
"step": 1730
},
{
"epoch": 1.2375533428165006,
"grad_norm": 2.778141590898155,
"learning_rate": 7.312616577918149e-06,
"loss": 0.0383,
"step": 1740
},
{
"epoch": 1.2446657183499288,
"grad_norm": 1.9995919460370448,
"learning_rate": 7.2758497261627345e-06,
"loss": 0.0357,
"step": 1750
},
{
"epoch": 1.251778093883357,
"grad_norm": 2.9469506417904725,
"learning_rate": 7.238926995109306e-06,
"loss": 0.0421,
"step": 1760
},
{
"epoch": 1.2588904694167853,
"grad_norm": 2.916214854110651,
"learning_rate": 7.201850913698736e-06,
"loss": 0.0433,
"step": 1770
},
{
"epoch": 1.2660028449502134,
"grad_norm": 2.4356023661651935,
"learning_rate": 7.164624021375294e-06,
"loss": 0.0381,
"step": 1780
},
{
"epoch": 1.2731152204836416,
"grad_norm": 2.9191479958429705,
"learning_rate": 7.12724886791271e-06,
"loss": 0.0347,
"step": 1790
},
{
"epoch": 1.2802275960170697,
"grad_norm": 2.7350905980602964,
"learning_rate": 7.08972801323953e-06,
"loss": 0.042,
"step": 1800
},
{
"epoch": 1.2873399715504978,
"grad_norm": 1.8638175145049911,
"learning_rate": 7.052064027263785e-06,
"loss": 0.0358,
"step": 1810
},
{
"epoch": 1.294452347083926,
"grad_norm": 2.518215951733799,
"learning_rate": 7.014259489696968e-06,
"loss": 0.0313,
"step": 1820
},
{
"epoch": 1.3015647226173541,
"grad_norm": 2.7073773489335435,
"learning_rate": 6.976316989877343e-06,
"loss": 0.0315,
"step": 1830
},
{
"epoch": 1.3086770981507825,
"grad_norm": 2.1307550105530932,
"learning_rate": 6.938239126592592e-06,
"loss": 0.0305,
"step": 1840
},
{
"epoch": 1.3157894736842106,
"grad_norm": 2.5312376788572957,
"learning_rate": 6.90002850790182e-06,
"loss": 0.0441,
"step": 1850
},
{
"epoch": 1.3229018492176388,
"grad_norm": 2.5163550627582523,
"learning_rate": 6.861687750956922e-06,
"loss": 0.0363,
"step": 1860
},
{
"epoch": 1.330014224751067,
"grad_norm": 2.121254806907021,
"learning_rate": 6.823219481823318e-06,
"loss": 0.035,
"step": 1870
},
{
"epoch": 1.337126600284495,
"grad_norm": 1.6219365468690443,
"learning_rate": 6.784626335300102e-06,
"loss": 0.0211,
"step": 1880
},
{
"epoch": 1.3442389758179232,
"grad_norm": 2.53112733146716,
"learning_rate": 6.745910954739563e-06,
"loss": 0.0368,
"step": 1890
},
{
"epoch": 1.3513513513513513,
"grad_norm": 2.262886209479993,
"learning_rate": 6.707075991866143e-06,
"loss": 0.0337,
"step": 1900
},
{
"epoch": 1.3584637268847795,
"grad_norm": 2.2016920030917126,
"learning_rate": 6.668124106594813e-06,
"loss": 0.0323,
"step": 1910
},
{
"epoch": 1.3655761024182076,
"grad_norm": 2.503481715381616,
"learning_rate": 6.629057966848879e-06,
"loss": 0.0374,
"step": 1920
},
{
"epoch": 1.3726884779516357,
"grad_norm": 2.282735039659182,
"learning_rate": 6.589880248377258e-06,
"loss": 0.0293,
"step": 1930
},
{
"epoch": 1.379800853485064,
"grad_norm": 2.2077630603832223,
"learning_rate": 6.550593634571205e-06,
"loss": 0.0326,
"step": 1940
},
{
"epoch": 1.3869132290184922,
"grad_norm": 2.4581246568090167,
"learning_rate": 6.511200816280523e-06,
"loss": 0.0279,
"step": 1950
},
{
"epoch": 1.3940256045519204,
"grad_norm": 1.9826120506712706,
"learning_rate": 6.471704491629251e-06,
"loss": 0.0348,
"step": 1960
},
{
"epoch": 1.4011379800853485,
"grad_norm": 2.0571244476045187,
"learning_rate": 6.432107365830872e-06,
"loss": 0.0303,
"step": 1970
},
{
"epoch": 1.4082503556187767,
"grad_norm": 2.5505998895912594,
"learning_rate": 6.392412151003019e-06,
"loss": 0.0379,
"step": 1980
},
{
"epoch": 1.4153627311522048,
"grad_norm": 2.764733651901472,
"learning_rate": 6.3526215659817156e-06,
"loss": 0.0346,
"step": 1990
},
{
"epoch": 1.422475106685633,
"grad_norm": 2.093793772844917,
"learning_rate": 6.312738336135159e-06,
"loss": 0.0277,
"step": 2000
},
{
"epoch": 1.4295874822190613,
"grad_norm": 2.36946856664872,
"learning_rate": 6.272765193177044e-06,
"loss": 0.0272,
"step": 2010
},
{
"epoch": 1.4366998577524894,
"grad_norm": 1.6148755839195088,
"learning_rate": 6.23270487497947e-06,
"loss": 0.0286,
"step": 2020
},
{
"epoch": 1.4438122332859176,
"grad_norm": 2.2679584601673284,
"learning_rate": 6.192560125385412e-06,
"loss": 0.0267,
"step": 2030
},
{
"epoch": 1.4509246088193457,
"grad_norm": 2.0895836816165927,
"learning_rate": 6.152333694020781e-06,
"loss": 0.0263,
"step": 2040
},
{
"epoch": 1.4580369843527738,
"grad_norm": 2.2843465595467625,
"learning_rate": 6.112028336106108e-06,
"loss": 0.0328,
"step": 2050
},
{
"epoch": 1.465149359886202,
"grad_norm": 2.223353990512444,
"learning_rate": 6.071646812267817e-06,
"loss": 0.0268,
"step": 2060
},
{
"epoch": 1.4722617354196301,
"grad_norm": 3.2498269359064613,
"learning_rate": 6.031191888349155e-06,
"loss": 0.0272,
"step": 2070
},
{
"epoch": 1.4793741109530583,
"grad_norm": 2.1334409386725577,
"learning_rate": 5.990666335220738e-06,
"loss": 0.027,
"step": 2080
},
{
"epoch": 1.4864864864864864,
"grad_norm": 1.1837205419905095,
"learning_rate": 5.950072928590781e-06,
"loss": 0.024,
"step": 2090
},
{
"epoch": 1.4935988620199145,
"grad_norm": 2.3246109475669043,
"learning_rate": 5.909414448814971e-06,
"loss": 0.0254,
"step": 2100
},
{
"epoch": 1.5007112375533427,
"grad_norm": 2.172355872063245,
"learning_rate": 5.8686936807060335e-06,
"loss": 0.0275,
"step": 2110
},
{
"epoch": 1.5078236130867708,
"grad_norm": 2.0037575574912725,
"learning_rate": 5.827913413343003e-06,
"loss": 0.0268,
"step": 2120
},
{
"epoch": 1.5149359886201992,
"grad_norm": 2.052997877430725,
"learning_rate": 5.787076439880177e-06,
"loss": 0.0265,
"step": 2130
},
{
"epoch": 1.5220483641536273,
"grad_norm": 2.6412414518353535,
"learning_rate": 5.746185557355814e-06,
"loss": 0.0254,
"step": 2140
},
{
"epoch": 1.5291607396870555,
"grad_norm": 2.037688763456784,
"learning_rate": 5.70524356650056e-06,
"loss": 0.022,
"step": 2150
},
{
"epoch": 1.5362731152204836,
"grad_norm": 1.8060364636807138,
"learning_rate": 5.664253271545603e-06,
"loss": 0.0196,
"step": 2160
},
{
"epoch": 1.543385490753912,
"grad_norm": 2.8659848398210994,
"learning_rate": 5.623217480030622e-06,
"loss": 0.0225,
"step": 2170
},
{
"epoch": 1.55049786628734,
"grad_norm": 1.7576344300050144,
"learning_rate": 5.58213900261148e-06,
"loss": 0.0209,
"step": 2180
},
{
"epoch": 1.5576102418207682,
"grad_norm": 2.4388617988676353,
"learning_rate": 5.541020652867713e-06,
"loss": 0.0272,
"step": 2190
},
{
"epoch": 1.5647226173541964,
"grad_norm": 1.8297232039252016,
"learning_rate": 5.49986524710983e-06,
"loss": 0.022,
"step": 2200
},
{
"epoch": 1.5718349928876245,
"grad_norm": 3.07701649104599,
"learning_rate": 5.4586756041864065e-06,
"loss": 0.0226,
"step": 2210
},
{
"epoch": 1.5789473684210527,
"grad_norm": 2.0923369231599502,
"learning_rate": 5.417454545291017e-06,
"loss": 0.0267,
"step": 2220
},
{
"epoch": 1.5860597439544808,
"grad_norm": 1.7648644786729606,
"learning_rate": 5.376204893769e-06,
"loss": 0.0226,
"step": 2230
},
{
"epoch": 1.593172119487909,
"grad_norm": 2.7349839796479705,
"learning_rate": 5.334929474924093e-06,
"loss": 0.0241,
"step": 2240
},
{
"epoch": 1.600284495021337,
"grad_norm": 1.220625942007617,
"learning_rate": 5.293631115824897e-06,
"loss": 0.0229,
"step": 2250
},
{
"epoch": 1.6073968705547652,
"grad_norm": 2.048027464288667,
"learning_rate": 5.252312645111266e-06,
"loss": 0.0244,
"step": 2260
},
{
"epoch": 1.6145092460881934,
"grad_norm": 1.807589998784463,
"learning_rate": 5.2109768928005454e-06,
"loss": 0.0186,
"step": 2270
},
{
"epoch": 1.6216216216216215,
"grad_norm": 1.8200396071062714,
"learning_rate": 5.169626690093751e-06,
"loss": 0.0217,
"step": 2280
},
{
"epoch": 1.6287339971550496,
"grad_norm": 3.162567751879996,
"learning_rate": 5.128264869181646e-06,
"loss": 0.0214,
"step": 2290
},
{
"epoch": 1.635846372688478,
"grad_norm": 1.819589839229396,
"learning_rate": 5.086894263050755e-06,
"loss": 0.0199,
"step": 2300
},
{
"epoch": 1.6429587482219061,
"grad_norm": 2.432299659362288,
"learning_rate": 5.045517705289328e-06,
"loss": 0.0184,
"step": 2310
},
{
"epoch": 1.6500711237553343,
"grad_norm": 1.3210052417654967,
"learning_rate": 5.004138029893257e-06,
"loss": 0.0189,
"step": 2320
},
{
"epoch": 1.6571834992887624,
"grad_norm": 2.1820820539792005,
"learning_rate": 4.9627580710719734e-06,
"loss": 0.0184,
"step": 2330
},
{
"epoch": 1.6642958748221908,
"grad_norm": 1.824213025460573,
"learning_rate": 4.921380663054318e-06,
"loss": 0.0195,
"step": 2340
},
{
"epoch": 1.671408250355619,
"grad_norm": 1.5395595832036817,
"learning_rate": 4.880008639894421e-06,
"loss": 0.0182,
"step": 2350
},
{
"epoch": 1.678520625889047,
"grad_norm": 2.1765686448734898,
"learning_rate": 4.838644835277585e-06,
"loss": 0.0216,
"step": 2360
},
{
"epoch": 1.6856330014224752,
"grad_norm": 1.6870995721298556,
"learning_rate": 4.79729208232621e-06,
"loss": 0.0172,
"step": 2370
},
{
"epoch": 1.6927453769559033,
"grad_norm": 0.7966147079317253,
"learning_rate": 4.75595321340573e-06,
"loss": 0.0115,
"step": 2380
},
{
"epoch": 1.6998577524893315,
"grad_norm": 2.172012245299623,
"learning_rate": 4.714631059930622e-06,
"loss": 0.0161,
"step": 2390
},
{
"epoch": 1.7069701280227596,
"grad_norm": 1.6833896185427781,
"learning_rate": 4.6733284521704816e-06,
"loss": 0.0184,
"step": 2400
},
{
"epoch": 1.7140825035561877,
"grad_norm": 1.943733582718135,
"learning_rate": 4.632048219056159e-06,
"loss": 0.0201,
"step": 2410
},
{
"epoch": 1.7211948790896159,
"grad_norm": 0.9592772600888013,
"learning_rate": 4.590793187986003e-06,
"loss": 0.0154,
"step": 2420
},
{
"epoch": 1.728307254623044,
"grad_norm": 2.418073085794697,
"learning_rate": 4.549566184632206e-06,
"loss": 0.0182,
"step": 2430
},
{
"epoch": 1.7354196301564722,
"grad_norm": 2.213535114167178,
"learning_rate": 4.508370032747261e-06,
"loss": 0.0162,
"step": 2440
},
{
"epoch": 1.7425320056899003,
"grad_norm": 1.4082214012150884,
"learning_rate": 4.467207553970564e-06,
"loss": 0.017,
"step": 2450
},
{
"epoch": 1.7496443812233284,
"grad_norm": 1.4580878754754951,
"learning_rate": 4.426081567635137e-06,
"loss": 0.0144,
"step": 2460
},
{
"epoch": 1.7567567567567568,
"grad_norm": 1.2997851819162691,
"learning_rate": 4.3849948905745385e-06,
"loss": 0.014,
"step": 2470
},
{
"epoch": 1.763869132290185,
"grad_norm": 1.6936376446584087,
"learning_rate": 4.343950336929927e-06,
"loss": 0.015,
"step": 2480
},
{
"epoch": 1.770981507823613,
"grad_norm": 0.9659927368936605,
"learning_rate": 4.302950717957304e-06,
"loss": 0.0144,
"step": 2490
},
{
"epoch": 1.7780938833570412,
"grad_norm": 0.863368492950506,
"learning_rate": 4.261998841834972e-06,
"loss": 0.0144,
"step": 2500
},
{
"epoch": 1.7852062588904696,
"grad_norm": 1.173314517778269,
"learning_rate": 4.221097513471199e-06,
"loss": 0.0114,
"step": 2510
},
{
"epoch": 1.7923186344238977,
"grad_norm": 1.2332466621958023,
"learning_rate": 4.18024953431209e-06,
"loss": 0.0127,
"step": 2520
},
{
"epoch": 1.7994310099573259,
"grad_norm": 1.1682909598329931,
"learning_rate": 4.13945770214971e-06,
"loss": 0.0149,
"step": 2530
},
{
"epoch": 1.806543385490754,
"grad_norm": 1.291108254245667,
"learning_rate": 4.098724810930472e-06,
"loss": 0.0129,
"step": 2540
},
{
"epoch": 1.8136557610241821,
"grad_norm": 0.8886518405396012,
"learning_rate": 4.058053650563747e-06,
"loss": 0.0124,
"step": 2550
},
{
"epoch": 1.8207681365576103,
"grad_norm": 1.9480002932590483,
"learning_rate": 4.017447006730796e-06,
"loss": 0.0139,
"step": 2560
},
{
"epoch": 1.8278805120910384,
"grad_norm": 1.6270472010472303,
"learning_rate": 3.976907660693954e-06,
"loss": 0.0128,
"step": 2570
},
{
"epoch": 1.8349928876244666,
"grad_norm": 0.7994173120546909,
"learning_rate": 3.936438389106154e-06,
"loss": 0.0154,
"step": 2580
},
{
"epoch": 1.8421052631578947,
"grad_norm": 1.0657933268821647,
"learning_rate": 3.896041963820724e-06,
"loss": 0.0162,
"step": 2590
},
{
"epoch": 1.8492176386913228,
"grad_norm": 1.37634738377725,
"learning_rate": 3.855721151701548e-06,
"loss": 0.0131,
"step": 2600
},
{
"epoch": 1.856330014224751,
"grad_norm": 1.7544039532579758,
"learning_rate": 3.815478714433559e-06,
"loss": 0.0129,
"step": 2610
},
{
"epoch": 1.863442389758179,
"grad_norm": 0.8340685854059343,
"learning_rate": 3.775317408333571e-06,
"loss": 0.0153,
"step": 2620
},
{
"epoch": 1.8705547652916072,
"grad_norm": 0.5628132394579408,
"learning_rate": 3.7352399841614996e-06,
"loss": 0.0116,
"step": 2630
},
{
"epoch": 1.8776671408250356,
"grad_norm": 0.6225982099610957,
"learning_rate": 3.695249186931954e-06,
"loss": 0.0126,
"step": 2640
},
{
"epoch": 1.8847795163584637,
"grad_norm": 1.3019140739354105,
"learning_rate": 3.655347755726224e-06,
"loss": 0.0111,
"step": 2650
},
{
"epoch": 1.8918918918918919,
"grad_norm": 1.8784232456646273,
"learning_rate": 3.6155384235046674e-06,
"loss": 0.0129,
"step": 2660
},
{
"epoch": 1.89900426742532,
"grad_norm": 1.2419343143332369,
"learning_rate": 3.5758239169195276e-06,
"loss": 0.0079,
"step": 2670
},
{
"epoch": 1.9061166429587484,
"grad_norm": 1.327768382551478,
"learning_rate": 3.5362069561281764e-06,
"loss": 0.0103,
"step": 2680
},
{
"epoch": 1.9132290184921765,
"grad_norm": 1.9613323508306293,
"learning_rate": 3.4966902546068016e-06,
"loss": 0.0127,
"step": 2690
},
{
"epoch": 1.9203413940256047,
"grad_norm": 1.6143025584170387,
"learning_rate": 3.4572765189645516e-06,
"loss": 0.0101,
"step": 2700
},
{
"epoch": 1.9274537695590328,
"grad_norm": 1.616056876993894,
"learning_rate": 3.4179684487581555e-06,
"loss": 0.0137,
"step": 2710
},
{
"epoch": 1.934566145092461,
"grad_norm": 0.8562390776041625,
"learning_rate": 3.3787687363070256e-06,
"loss": 0.0109,
"step": 2720
},
{
"epoch": 1.941678520625889,
"grad_norm": 1.1375907109658978,
"learning_rate": 3.3396800665088435e-06,
"loss": 0.0105,
"step": 2730
},
{
"epoch": 1.9487908961593172,
"grad_norm": 1.4045001817445895,
"learning_rate": 3.300705116655672e-06,
"loss": 0.0092,
"step": 2740
},
{
"epoch": 1.9559032716927454,
"grad_norm": 1.3938214602230523,
"learning_rate": 3.26184655625058e-06,
"loss": 0.0074,
"step": 2750
},
{
"epoch": 1.9630156472261735,
"grad_norm": 1.125496278925233,
"learning_rate": 3.2231070468247954e-06,
"loss": 0.0108,
"step": 2760
},
{
"epoch": 1.9701280227596016,
"grad_norm": 1.0938627450279363,
"learning_rate": 3.1844892417554102e-06,
"loss": 0.0085,
"step": 2770
},
{
"epoch": 1.9772403982930298,
"grad_norm": 1.0213367474449344,
"learning_rate": 3.1459957860836528e-06,
"loss": 0.0109,
"step": 2780
},
{
"epoch": 1.984352773826458,
"grad_norm": 1.4919999851950547,
"learning_rate": 3.1076293163337074e-06,
"loss": 0.0098,
"step": 2790
},
{
"epoch": 1.991465149359886,
"grad_norm": 2.7046778198212404,
"learning_rate": 3.069392460332141e-06,
"loss": 0.0125,
"step": 2800
},
{
"epoch": 1.9985775248933144,
"grad_norm": 0.7341284079698729,
"learning_rate": 3.031287837027911e-06,
"loss": 0.0086,
"step": 2810
},
{
"epoch": 2.0056899004267423,
"grad_norm": 1.5705489322569974,
"learning_rate": 2.9933180563129936e-06,
"loss": 0.0061,
"step": 2820
},
{
"epoch": 2.012802275960171,
"grad_norm": 0.31350772488690454,
"learning_rate": 2.955485718843616e-06,
"loss": 0.0074,
"step": 2830
},
{
"epoch": 2.019914651493599,
"grad_norm": 0.6018726291077504,
"learning_rate": 2.917793415862129e-06,
"loss": 0.0082,
"step": 2840
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.6809782196576207,
"learning_rate": 2.880243729019546e-06,
"loss": 0.005,
"step": 2850
},
{
"epoch": 2.0341394025604553,
"grad_norm": 0.520827132094845,
"learning_rate": 2.842839230198685e-06,
"loss": 0.0072,
"step": 2860
},
{
"epoch": 2.0412517780938835,
"grad_norm": 1.1302843148069386,
"learning_rate": 2.805582481338044e-06,
"loss": 0.008,
"step": 2870
},
{
"epoch": 2.0483641536273116,
"grad_norm": 0.3587179831514039,
"learning_rate": 2.7684760342563045e-06,
"loss": 0.0063,
"step": 2880
},
{
"epoch": 2.0554765291607398,
"grad_norm": 1.116585578193006,
"learning_rate": 2.731522430477571e-06,
"loss": 0.0066,
"step": 2890
},
{
"epoch": 2.062588904694168,
"grad_norm": 0.9083509287470979,
"learning_rate": 2.694724201057273e-06,
"loss": 0.0087,
"step": 2900
},
{
"epoch": 2.069701280227596,
"grad_norm": 0.6955128987380951,
"learning_rate": 2.6580838664088214e-06,
"loss": 0.0083,
"step": 2910
},
{
"epoch": 2.076813655761024,
"grad_norm": 2.217810880330465,
"learning_rate": 2.6216039361309753e-06,
"loss": 0.0084,
"step": 2920
},
{
"epoch": 2.0839260312944523,
"grad_norm": 0.833769806548182,
"learning_rate": 2.5852869088359495e-06,
"loss": 0.0066,
"step": 2930
},
{
"epoch": 2.0910384068278804,
"grad_norm": 0.7464675150972822,
"learning_rate": 2.549135271978275e-06,
"loss": 0.0051,
"step": 2940
},
{
"epoch": 2.0981507823613086,
"grad_norm": 0.2874640695250341,
"learning_rate": 2.5131515016844345e-06,
"loss": 0.0094,
"step": 2950
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.5951223484484282,
"learning_rate": 2.4773380625832603e-06,
"loss": 0.0073,
"step": 2960
},
{
"epoch": 2.112375533428165,
"grad_norm": 0.8617405032410067,
"learning_rate": 2.4416974076371304e-06,
"loss": 0.0063,
"step": 2970
},
{
"epoch": 2.119487908961593,
"grad_norm": 0.20697744879238947,
"learning_rate": 2.406231977973942e-06,
"loss": 0.0061,
"step": 2980
},
{
"epoch": 2.126600284495021,
"grad_norm": 1.0179575970597508,
"learning_rate": 2.3709442027199387e-06,
"loss": 0.0091,
"step": 2990
},
{
"epoch": 2.1337126600284497,
"grad_norm": 1.0606767583952494,
"learning_rate": 2.3358364988333066e-06,
"loss": 0.0063,
"step": 3000
},
{
"epoch": 2.140825035561878,
"grad_norm": 1.2307112817732728,
"learning_rate": 2.3009112709386454e-06,
"loss": 0.0072,
"step": 3010
},
{
"epoch": 2.147937411095306,
"grad_norm": 1.1984078867472323,
"learning_rate": 2.2661709111622666e-06,
"loss": 0.0074,
"step": 3020
},
{
"epoch": 2.155049786628734,
"grad_norm": 0.1995779664837118,
"learning_rate": 2.2316177989683458e-06,
"loss": 0.0048,
"step": 3030
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.9273305310197127,
"learning_rate": 2.197254300995953e-06,
"loss": 0.0072,
"step": 3040
},
{
"epoch": 2.1692745376955904,
"grad_norm": 0.228767885995747,
"learning_rate": 2.163082770896943e-06,
"loss": 0.006,
"step": 3050
},
{
"epoch": 2.1763869132290186,
"grad_norm": 1.1353632110102576,
"learning_rate": 2.1291055491747643e-06,
"loss": 0.0067,
"step": 3060
},
{
"epoch": 2.1834992887624467,
"grad_norm": 0.909003973479003,
"learning_rate": 2.095324963024137e-06,
"loss": 0.0063,
"step": 3070
},
{
"epoch": 2.190611664295875,
"grad_norm": 0.8569150241839785,
"learning_rate": 2.061743326171668e-06,
"loss": 0.0047,
"step": 3080
},
{
"epoch": 2.197724039829303,
"grad_norm": 0.5078332905539804,
"learning_rate": 2.02836293871736e-06,
"loss": 0.0076,
"step": 3090
},
{
"epoch": 2.204836415362731,
"grad_norm": 0.206100019331658,
"learning_rate": 1.9951860869771e-06,
"loss": 0.0051,
"step": 3100
},
{
"epoch": 2.2119487908961593,
"grad_norm": 0.34329585167252463,
"learning_rate": 1.962215043326029e-06,
"loss": 0.0058,
"step": 3110
},
{
"epoch": 2.2190611664295874,
"grad_norm": 0.9524416758915172,
"learning_rate": 1.9294520660429284e-06,
"loss": 0.0057,
"step": 3120
},
{
"epoch": 2.2261735419630155,
"grad_norm": 0.2901777267550113,
"learning_rate": 1.8968993991555301e-06,
"loss": 0.0077,
"step": 3130
},
{
"epoch": 2.2332859174964437,
"grad_norm": 0.22017782629955687,
"learning_rate": 1.8645592722868223e-06,
"loss": 0.0063,
"step": 3140
},
{
"epoch": 2.240398293029872,
"grad_norm": 0.38133062392097455,
"learning_rate": 1.8324339005023273e-06,
"loss": 0.0072,
"step": 3150
},
{
"epoch": 2.2475106685633,
"grad_norm": 0.17821853650446573,
"learning_rate": 1.8005254841584035e-06,
"loss": 0.0053,
"step": 3160
},
{
"epoch": 2.2546230440967285,
"grad_norm": 0.22314200324222835,
"learning_rate": 1.768836208751516e-06,
"loss": 0.0043,
"step": 3170
},
{
"epoch": 2.2617354196301562,
"grad_norm": 0.6640226689598476,
"learning_rate": 1.7373682447685624e-06,
"loss": 0.0051,
"step": 3180
},
{
"epoch": 2.268847795163585,
"grad_norm": 0.4186199899663268,
"learning_rate": 1.706123747538196e-06,
"loss": 0.0053,
"step": 3190
},
{
"epoch": 2.275960170697013,
"grad_norm": 0.2394022704349156,
"learning_rate": 1.6751048570832184e-06,
"loss": 0.0058,
"step": 3200
},
{
"epoch": 2.283072546230441,
"grad_norm": 0.4686051788122248,
"learning_rate": 1.6443136979739855e-06,
"loss": 0.0041,
"step": 3210
},
{
"epoch": 2.2901849217638692,
"grad_norm": 0.3836913082140705,
"learning_rate": 1.6137523791829007e-06,
"loss": 0.0041,
"step": 3220
},
{
"epoch": 2.2972972972972974,
"grad_norm": 0.16170960660071848,
"learning_rate": 1.5834229939399637e-06,
"loss": 0.0061,
"step": 3230
},
{
"epoch": 2.3044096728307255,
"grad_norm": 0.18200293554129154,
"learning_rate": 1.5533276195893987e-06,
"loss": 0.005,
"step": 3240
},
{
"epoch": 2.3115220483641536,
"grad_norm": 0.23940834550194912,
"learning_rate": 1.5234683174473669e-06,
"loss": 0.0046,
"step": 3250
},
{
"epoch": 2.318634423897582,
"grad_norm": 0.24406884268015908,
"learning_rate": 1.493847132660789e-06,
"loss": 0.0048,
"step": 3260
},
{
"epoch": 2.32574679943101,
"grad_norm": 0.26509665588494075,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.0059,
"step": 3270
},
{
"epoch": 2.332859174964438,
"grad_norm": 0.475162427962314,
"learning_rate": 1.435327214056103e-06,
"loss": 0.0051,
"step": 3280
},
{
"epoch": 2.339971550497866,
"grad_norm": 0.30098307863573437,
"learning_rate": 1.406432488430508e-06,
"loss": 0.005,
"step": 3290
},
{
"epoch": 2.3470839260312943,
"grad_norm": 0.20931527074041337,
"learning_rate": 1.3777838962708602e-06,
"loss": 0.0046,
"step": 3300
},
{
"epoch": 2.3541963015647225,
"grad_norm": 0.22874545442845454,
"learning_rate": 1.3493833997991745e-06,
"loss": 0.0041,
"step": 3310
},
{
"epoch": 2.3613086770981506,
"grad_norm": 1.4075066556310636,
"learning_rate": 1.3212329442446985e-06,
"loss": 0.0058,
"step": 3320
},
{
"epoch": 2.3684210526315788,
"grad_norm": 0.8975809394063826,
"learning_rate": 1.2933344577106822e-06,
"loss": 0.005,
"step": 3330
},
{
"epoch": 2.3755334281650073,
"grad_norm": 0.4148231917835156,
"learning_rate": 1.2656898510423122e-06,
"loss": 0.0052,
"step": 3340
},
{
"epoch": 2.382645803698435,
"grad_norm": 0.11195947556497522,
"learning_rate": 1.2383010176958372e-06,
"loss": 0.0048,
"step": 3350
},
{
"epoch": 2.3897581792318636,
"grad_norm": 0.26661464526041523,
"learning_rate": 1.2111698336088717e-06,
"loss": 0.0054,
"step": 3360
},
{
"epoch": 2.3968705547652918,
"grad_norm": 1.370337214123434,
"learning_rate": 1.1842981570719237e-06,
"loss": 0.0048,
"step": 3370
},
{
"epoch": 2.40398293029872,
"grad_norm": 0.26207335784094793,
"learning_rate": 1.157687828601094e-06,
"loss": 0.0047,
"step": 3380
},
{
"epoch": 2.411095305832148,
"grad_norm": 0.1752729581013448,
"learning_rate": 1.1313406708120327e-06,
"loss": 0.0039,
"step": 3390
},
{
"epoch": 2.418207681365576,
"grad_norm": 0.2700693202909992,
"learning_rate": 1.1052584882950896e-06,
"loss": 0.0043,
"step": 3400
},
{
"epoch": 2.4253200568990043,
"grad_norm": 1.3868146763763338,
"learning_rate": 1.0794430674917262e-06,
"loss": 0.0036,
"step": 3410
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.5361814766521613,
"learning_rate": 1.0538961765721429e-06,
"loss": 0.0049,
"step": 3420
},
{
"epoch": 2.4395448079658606,
"grad_norm": 0.2257026998579445,
"learning_rate": 1.0286195653141822e-06,
"loss": 0.0044,
"step": 3430
},
{
"epoch": 2.4466571834992887,
"grad_norm": 0.28821507076356073,
"learning_rate": 1.0036149649834786e-06,
"loss": 0.0041,
"step": 3440
},
{
"epoch": 2.453769559032717,
"grad_norm": 0.20288584422361874,
"learning_rate": 9.788840882148803e-07,
"loss": 0.0044,
"step": 3450
},
{
"epoch": 2.460881934566145,
"grad_norm": 0.2961984403628135,
"learning_rate": 9.544286288951393e-07,
"loss": 0.0035,
"step": 3460
},
{
"epoch": 2.467994310099573,
"grad_norm": 0.34744720835705395,
"learning_rate": 9.302502620469073e-07,
"loss": 0.0046,
"step": 3470
},
{
"epoch": 2.4751066856330013,
"grad_norm": 0.21259678229337609,
"learning_rate": 9.063506437139901e-07,
"loss": 0.0039,
"step": 3480
},
{
"epoch": 2.4822190611664294,
"grad_norm": 0.18018604906671717,
"learning_rate": 8.827314108479357e-07,
"loss": 0.0039,
"step": 3490
},
{
"epoch": 2.4893314366998576,
"grad_norm": 0.2326353801367776,
"learning_rate": 8.593941811959078e-07,
"loss": 0.0038,
"step": 3500
},
{
"epoch": 2.496443812233286,
"grad_norm": 0.1741953097808654,
"learning_rate": 8.363405531898833e-07,
"loss": 0.004,
"step": 3510
},
{
"epoch": 2.503556187766714,
"grad_norm": 0.25494195054679114,
"learning_rate": 8.135721058371681e-07,
"loss": 0.0038,
"step": 3520
},
{
"epoch": 2.5106685633001424,
"grad_norm": 0.18154830083734025,
"learning_rate": 7.910903986122537e-07,
"loss": 0.0024,
"step": 3530
},
{
"epoch": 2.5177809388335706,
"grad_norm": 0.21685694657911553,
"learning_rate": 7.688969713499983e-07,
"loss": 0.0037,
"step": 3540
},
{
"epoch": 2.5248933143669987,
"grad_norm": 0.22019585417284554,
"learning_rate": 7.469933441401606e-07,
"loss": 0.0041,
"step": 3550
},
{
"epoch": 2.532005689900427,
"grad_norm": 0.12503238568854871,
"learning_rate": 7.253810172232867e-07,
"loss": 0.0033,
"step": 3560
},
{
"epoch": 2.539118065433855,
"grad_norm": 0.20164607745958418,
"learning_rate": 7.040614708879489e-07,
"loss": 0.0036,
"step": 3570
},
{
"epoch": 2.546230440967283,
"grad_norm": 1.1353152225303087,
"learning_rate": 6.830361653693673e-07,
"loss": 0.0037,
"step": 3580
},
{
"epoch": 2.5533428165007113,
"grad_norm": 0.25954463871540623,
"learning_rate": 6.623065407493801e-07,
"loss": 0.0037,
"step": 3590
},
{
"epoch": 2.5604551920341394,
"grad_norm": 0.17926285547151083,
"learning_rate": 6.418740168578208e-07,
"loss": 0.0033,
"step": 3600
},
{
"epoch": 2.5675675675675675,
"grad_norm": 0.15055900043010326,
"learning_rate": 6.217399931752627e-07,
"loss": 0.0035,
"step": 3610
},
{
"epoch": 2.5746799431009957,
"grad_norm": 0.18791412690022857,
"learning_rate": 6.019058487371687e-07,
"loss": 0.0033,
"step": 3620
},
{
"epoch": 2.581792318634424,
"grad_norm": 0.11425352927323033,
"learning_rate": 5.82372942039432e-07,
"loss": 0.0042,
"step": 3630
},
{
"epoch": 2.588904694167852,
"grad_norm": 0.3824468109713527,
"learning_rate": 5.631426109453364e-07,
"loss": 0.004,
"step": 3640
},
{
"epoch": 2.59601706970128,
"grad_norm": 0.15674551917384533,
"learning_rate": 5.44216172593916e-07,
"loss": 0.0037,
"step": 3650
},
{
"epoch": 2.6031294452347082,
"grad_norm": 0.2813443939041295,
"learning_rate": 5.255949233097451e-07,
"loss": 0.0037,
"step": 3660
},
{
"epoch": 2.6102418207681364,
"grad_norm": 0.18387812179204338,
"learning_rate": 5.072801385141429e-07,
"loss": 0.0036,
"step": 3670
},
{
"epoch": 2.617354196301565,
"grad_norm": 0.2156232450844716,
"learning_rate": 4.89273072637827e-07,
"loss": 0.0032,
"step": 3680
},
{
"epoch": 2.6244665718349927,
"grad_norm": 0.12440006889757067,
"learning_rate": 4.7157495903498105e-07,
"loss": 0.0033,
"step": 3690
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.17119738387604694,
"learning_rate": 4.541870098987911e-07,
"loss": 0.0037,
"step": 3700
},
{
"epoch": 2.6386913229018494,
"grad_norm": 0.1919029589645196,
"learning_rate": 4.371104161784073e-07,
"loss": 0.0043,
"step": 3710
},
{
"epoch": 2.6458036984352775,
"grad_norm": 0.23702321107949612,
"learning_rate": 4.2034634749738623e-07,
"loss": 0.0032,
"step": 3720
},
{
"epoch": 2.6529160739687057,
"grad_norm": 0.15697463651810808,
"learning_rate": 4.038959520735658e-07,
"loss": 0.004,
"step": 3730
},
{
"epoch": 2.660028449502134,
"grad_norm": 0.09125511769820786,
"learning_rate": 3.8776035664043033e-07,
"loss": 0.0037,
"step": 3740
},
{
"epoch": 2.667140825035562,
"grad_norm": 0.17251531382162535,
"learning_rate": 3.719406663699349e-07,
"loss": 0.0042,
"step": 3750
},
{
"epoch": 2.67425320056899,
"grad_norm": 0.22855108481819528,
"learning_rate": 3.564379647968064e-07,
"loss": 0.0035,
"step": 3760
},
{
"epoch": 2.681365576102418,
"grad_norm": 0.1174830009274809,
"learning_rate": 3.4125331374433414e-07,
"loss": 0.0038,
"step": 3770
},
{
"epoch": 2.6884779516358464,
"grad_norm": 0.13196615257533414,
"learning_rate": 3.2638775325163517e-07,
"loss": 0.0029,
"step": 3780
},
{
"epoch": 2.6955903271692745,
"grad_norm": 0.16568974053657992,
"learning_rate": 3.1184230150243025e-07,
"loss": 0.0028,
"step": 3790
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.16710801427024333,
"learning_rate": 2.9761795475529375e-07,
"loss": 0.0031,
"step": 3800
},
{
"epoch": 2.7098150782361308,
"grad_norm": 0.1500941244877554,
"learning_rate": 2.8371568727542486e-07,
"loss": 0.0036,
"step": 3810
},
{
"epoch": 2.716927453769559,
"grad_norm": 0.19067715703132965,
"learning_rate": 2.7013645126791446e-07,
"loss": 0.003,
"step": 3820
},
{
"epoch": 2.724039829302987,
"grad_norm": 0.15140629484800455,
"learning_rate": 2.5688117681252677e-07,
"loss": 0.0033,
"step": 3830
},
{
"epoch": 2.731152204836415,
"grad_norm": 0.17248672811681454,
"learning_rate": 2.439507717999945e-07,
"loss": 0.003,
"step": 3840
},
{
"epoch": 2.7382645803698438,
"grad_norm": 0.1782041915340656,
"learning_rate": 2.3134612186983817e-07,
"loss": 0.0036,
"step": 3850
},
{
"epoch": 2.7453769559032715,
"grad_norm": 0.3756662666979349,
"learning_rate": 2.1906809034970057e-07,
"loss": 0.0035,
"step": 3860
},
{
"epoch": 2.7524893314367,
"grad_norm": 0.2078367449157142,
"learning_rate": 2.0711751819622038e-07,
"loss": 0.003,
"step": 3870
},
{
"epoch": 2.759601706970128,
"grad_norm": 0.29097101889612503,
"learning_rate": 1.954952239374286e-07,
"loss": 0.0042,
"step": 3880
},
{
"epoch": 2.7667140825035563,
"grad_norm": 0.25963382600527757,
"learning_rate": 1.8420200361669137e-07,
"loss": 0.0036,
"step": 3890
},
{
"epoch": 2.7738264580369845,
"grad_norm": 0.1769468548637021,
"learning_rate": 1.732386307381767e-07,
"loss": 0.0031,
"step": 3900
},
{
"epoch": 2.7809388335704126,
"grad_norm": 0.15613753091399799,
"learning_rate": 1.6260585621388604e-07,
"loss": 0.0034,
"step": 3910
},
{
"epoch": 2.7880512091038407,
"grad_norm": 0.1956972505836662,
"learning_rate": 1.523044083122138e-07,
"loss": 0.0044,
"step": 3920
},
{
"epoch": 2.795163584637269,
"grad_norm": 0.23888957121934698,
"learning_rate": 1.4233499260807194e-07,
"loss": 0.0045,
"step": 3930
},
{
"epoch": 2.802275960170697,
"grad_norm": 0.20491135818198022,
"learning_rate": 1.326982919345582e-07,
"loss": 0.0034,
"step": 3940
},
{
"epoch": 2.809388335704125,
"grad_norm": 0.16904575085328288,
"learning_rate": 1.2339496633619218e-07,
"loss": 0.0031,
"step": 3950
},
{
"epoch": 2.8165007112375533,
"grad_norm": 0.1259555532924781,
"learning_rate": 1.1442565302370146e-07,
"loss": 0.0029,
"step": 3960
},
{
"epoch": 2.8236130867709814,
"grad_norm": 0.15248025417153557,
"learning_rate": 1.0579096633038411e-07,
"loss": 0.0037,
"step": 3970
},
{
"epoch": 2.8307254623044096,
"grad_norm": 0.12827921470857406,
"learning_rate": 9.749149767002197e-08,
"loss": 0.0032,
"step": 3980
},
{
"epoch": 2.8378378378378377,
"grad_norm": 0.22583248679646706,
"learning_rate": 8.952781549638412e-08,
"loss": 0.004,
"step": 3990
},
{
"epoch": 2.844950213371266,
"grad_norm": 0.18816430934886938,
"learning_rate": 8.190046526428241e-08,
"loss": 0.0031,
"step": 4000
},
{
"epoch": 2.852062588904694,
"grad_norm": 0.23202372767680643,
"learning_rate": 7.460996939221643e-08,
"loss": 0.0039,
"step": 4010
},
{
"epoch": 2.8591749644381226,
"grad_norm": 0.1623071297031127,
"learning_rate": 6.765682722659151e-08,
"loss": 0.0039,
"step": 4020
},
{
"epoch": 2.8662873399715503,
"grad_norm": 0.2130255450940709,
"learning_rate": 6.104151500751609e-08,
"loss": 0.0028,
"step": 4030
},
{
"epoch": 2.873399715504979,
"grad_norm": 0.14259971568207008,
"learning_rate": 5.476448583618288e-08,
"loss": 0.0039,
"step": 4040
},
{
"epoch": 2.8805120910384066,
"grad_norm": 0.18255687916100874,
"learning_rate": 4.8826169643832464e-08,
"loss": 0.0028,
"step": 4050
},
{
"epoch": 2.887624466571835,
"grad_norm": 0.2053446140861889,
"learning_rate": 4.322697316231361e-08,
"loss": 0.0039,
"step": 4060
},
{
"epoch": 2.8947368421052633,
"grad_norm": 0.1604377238062154,
"learning_rate": 3.796727989621385e-08,
"loss": 0.0027,
"step": 4070
},
{
"epoch": 2.9018492176386914,
"grad_norm": 0.1963367249030138,
"learning_rate": 3.304745009660326e-08,
"loss": 0.0033,
"step": 4080
},
{
"epoch": 2.9089615931721196,
"grad_norm": 0.3357461992161217,
"learning_rate": 2.8467820736350903e-08,
"loss": 0.0031,
"step": 4090
},
{
"epoch": 2.9160739687055477,
"grad_norm": 0.11268055230811629,
"learning_rate": 2.422870548705103e-08,
"loss": 0.0032,
"step": 4100
},
{
"epoch": 2.923186344238976,
"grad_norm": 0.2544008002680536,
"learning_rate": 2.0330394697534726e-08,
"loss": 0.0037,
"step": 4110
},
{
"epoch": 2.930298719772404,
"grad_norm": 0.13380581381300227,
"learning_rate": 1.677315537398583e-08,
"loss": 0.0038,
"step": 4120
},
{
"epoch": 2.937411095305832,
"grad_norm": 0.18134475982283993,
"learning_rate": 1.355723116165164e-08,
"loss": 0.0032,
"step": 4130
},
{
"epoch": 2.9445234708392602,
"grad_norm": 0.26101935700080225,
"learning_rate": 1.0682842328154086e-08,
"loss": 0.0033,
"step": 4140
},
{
"epoch": 2.9516358463726884,
"grad_norm": 0.19538728836785646,
"learning_rate": 8.150185748405092e-09,
"loss": 0.0037,
"step": 4150
},
{
"epoch": 2.9587482219061165,
"grad_norm": 0.22029377776143994,
"learning_rate": 5.959434891121274e-09,
"loss": 0.0034,
"step": 4160
},
{
"epoch": 2.9658605974395447,
"grad_norm": 0.2119562852212782,
"learning_rate": 4.110739806940656e-09,
"loss": 0.003,
"step": 4170
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.21084982886316375,
"learning_rate": 2.604227118148117e-09,
"loss": 0.0027,
"step": 4180
},
{
"epoch": 2.9800853485064014,
"grad_norm": 0.30515012873944525,
"learning_rate": 1.4400000100017741e-09,
"loss": 0.0033,
"step": 4190
},
{
"epoch": 2.987197724039829,
"grad_norm": 0.15412371556633805,
"learning_rate": 6.181382236641887e-10,
"loss": 0.0028,
"step": 4200
},
{
"epoch": 2.9943100995732577,
"grad_norm": 0.19260652338114095,
"learning_rate": 1.3869805074284704e-10,
"loss": 0.0033,
"step": 4210
},
{
"epoch": 3.0,
"step": 4218,
"total_flos": 187524427284480.0,
"train_loss": 0.23043089039047898,
"train_runtime": 21192.9138,
"train_samples_per_second": 12.735,
"train_steps_per_second": 0.199
}
],
"logging_steps": 10,
"max_steps": 4218,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 187524427284480.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}