qwen-debug-unlearn / trainer_state.json
jackysnake's picture
Upload folder using huggingface_hub
ddb8ccc verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4218,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007112375533428165,
"grad_norm": 45.94488445688017,
"learning_rate": 2.132701421800948e-07,
"loss": 4.9235,
"step": 10
},
{
"epoch": 0.01422475106685633,
"grad_norm": 56.45310643883483,
"learning_rate": 4.502369668246446e-07,
"loss": 4.7616,
"step": 20
},
{
"epoch": 0.021337126600284494,
"grad_norm": 47.57072569736425,
"learning_rate": 6.872037914691944e-07,
"loss": 4.0518,
"step": 30
},
{
"epoch": 0.02844950213371266,
"grad_norm": 14.9615219454182,
"learning_rate": 9.241706161137441e-07,
"loss": 3.1168,
"step": 40
},
{
"epoch": 0.03556187766714083,
"grad_norm": 9.209691028948875,
"learning_rate": 1.161137440758294e-06,
"loss": 2.408,
"step": 50
},
{
"epoch": 0.04267425320056899,
"grad_norm": 3.738856271681981,
"learning_rate": 1.3981042654028437e-06,
"loss": 2.0996,
"step": 60
},
{
"epoch": 0.049786628733997154,
"grad_norm": 4.299210545328982,
"learning_rate": 1.6350710900473934e-06,
"loss": 1.961,
"step": 70
},
{
"epoch": 0.05689900426742532,
"grad_norm": 4.288339811445908,
"learning_rate": 1.8720379146919433e-06,
"loss": 1.8454,
"step": 80
},
{
"epoch": 0.06401137980085349,
"grad_norm": 4.487588443815648,
"learning_rate": 2.109004739336493e-06,
"loss": 1.8,
"step": 90
},
{
"epoch": 0.07112375533428165,
"grad_norm": 5.903658522691362,
"learning_rate": 2.345971563981043e-06,
"loss": 1.7189,
"step": 100
},
{
"epoch": 0.07823613086770982,
"grad_norm": 6.538803049950138,
"learning_rate": 2.5829383886255925e-06,
"loss": 1.6861,
"step": 110
},
{
"epoch": 0.08534850640113797,
"grad_norm": 6.420212036240461,
"learning_rate": 2.8199052132701426e-06,
"loss": 1.6933,
"step": 120
},
{
"epoch": 0.09246088193456614,
"grad_norm": 6.08601994925446,
"learning_rate": 3.0568720379146923e-06,
"loss": 1.6477,
"step": 130
},
{
"epoch": 0.09957325746799431,
"grad_norm": 6.641158507404538,
"learning_rate": 3.293838862559242e-06,
"loss": 1.5837,
"step": 140
},
{
"epoch": 0.10668563300142248,
"grad_norm": 5.6671416215198445,
"learning_rate": 3.5308056872037916e-06,
"loss": 1.553,
"step": 150
},
{
"epoch": 0.11379800853485064,
"grad_norm": 4.895576620125158,
"learning_rate": 3.7677725118483417e-06,
"loss": 1.601,
"step": 160
},
{
"epoch": 0.12091038406827881,
"grad_norm": 5.00629870941093,
"learning_rate": 4.004739336492891e-06,
"loss": 1.4953,
"step": 170
},
{
"epoch": 0.12802275960170698,
"grad_norm": 4.098161366916081,
"learning_rate": 4.2417061611374415e-06,
"loss": 1.4986,
"step": 180
},
{
"epoch": 0.13513513513513514,
"grad_norm": 4.279942094132115,
"learning_rate": 4.478672985781991e-06,
"loss": 1.4177,
"step": 190
},
{
"epoch": 0.1422475106685633,
"grad_norm": 2.9957264584301506,
"learning_rate": 4.715639810426541e-06,
"loss": 1.4234,
"step": 200
},
{
"epoch": 0.14935988620199148,
"grad_norm": 2.960846105003115,
"learning_rate": 4.952606635071091e-06,
"loss": 1.4034,
"step": 210
},
{
"epoch": 0.15647226173541964,
"grad_norm": 2.8142395090714207,
"learning_rate": 5.18957345971564e-06,
"loss": 1.4172,
"step": 220
},
{
"epoch": 0.16358463726884778,
"grad_norm": 3.1953820486960938,
"learning_rate": 5.42654028436019e-06,
"loss": 1.3695,
"step": 230
},
{
"epoch": 0.17069701280227595,
"grad_norm": 3.0329786581569813,
"learning_rate": 5.66350710900474e-06,
"loss": 1.3815,
"step": 240
},
{
"epoch": 0.17780938833570412,
"grad_norm": 3.023917167954777,
"learning_rate": 5.90047393364929e-06,
"loss": 1.3494,
"step": 250
},
{
"epoch": 0.18492176386913228,
"grad_norm": 3.3061969408501186,
"learning_rate": 6.137440758293839e-06,
"loss": 1.351,
"step": 260
},
{
"epoch": 0.19203413940256045,
"grad_norm": 3.0703691360984116,
"learning_rate": 6.374407582938389e-06,
"loss": 1.3007,
"step": 270
},
{
"epoch": 0.19914651493598862,
"grad_norm": 2.6510030082143072,
"learning_rate": 6.611374407582939e-06,
"loss": 1.2318,
"step": 280
},
{
"epoch": 0.20625889046941678,
"grad_norm": 2.781634197302321,
"learning_rate": 6.848341232227489e-06,
"loss": 1.2452,
"step": 290
},
{
"epoch": 0.21337126600284495,
"grad_norm": 2.889926592158047,
"learning_rate": 7.085308056872039e-06,
"loss": 1.2299,
"step": 300
},
{
"epoch": 0.22048364153627312,
"grad_norm": 2.9460513709926546,
"learning_rate": 7.322274881516588e-06,
"loss": 1.2481,
"step": 310
},
{
"epoch": 0.22759601706970128,
"grad_norm": 3.117250263470296,
"learning_rate": 7.559241706161138e-06,
"loss": 1.1874,
"step": 320
},
{
"epoch": 0.23470839260312945,
"grad_norm": 3.1068660585891443,
"learning_rate": 7.796208530805689e-06,
"loss": 1.242,
"step": 330
},
{
"epoch": 0.24182076813655762,
"grad_norm": 3.2303235755610458,
"learning_rate": 8.033175355450237e-06,
"loss": 1.1656,
"step": 340
},
{
"epoch": 0.24893314366998578,
"grad_norm": 3.380471682074544,
"learning_rate": 8.270142180094787e-06,
"loss": 1.1626,
"step": 350
},
{
"epoch": 0.25604551920341395,
"grad_norm": 3.0003799025494455,
"learning_rate": 8.507109004739337e-06,
"loss": 1.1136,
"step": 360
},
{
"epoch": 0.2631578947368421,
"grad_norm": 3.3507131315688037,
"learning_rate": 8.744075829383887e-06,
"loss": 1.109,
"step": 370
},
{
"epoch": 0.2702702702702703,
"grad_norm": 3.286430938699654,
"learning_rate": 8.981042654028437e-06,
"loss": 1.0926,
"step": 380
},
{
"epoch": 0.2773826458036984,
"grad_norm": 3.543025306575121,
"learning_rate": 9.218009478672988e-06,
"loss": 0.9856,
"step": 390
},
{
"epoch": 0.2844950213371266,
"grad_norm": 2.9641151250477,
"learning_rate": 9.454976303317538e-06,
"loss": 1.0438,
"step": 400
},
{
"epoch": 0.29160739687055476,
"grad_norm": 3.0879210891464175,
"learning_rate": 9.691943127962086e-06,
"loss": 0.9834,
"step": 410
},
{
"epoch": 0.29871977240398295,
"grad_norm": 3.5828764512704274,
"learning_rate": 9.928909952606636e-06,
"loss": 1.0355,
"step": 420
},
{
"epoch": 0.3058321479374111,
"grad_norm": 3.0432346994349944,
"learning_rate": 9.99991609608766e-06,
"loss": 0.9973,
"step": 430
},
{
"epoch": 0.3129445234708393,
"grad_norm": 3.6852442122283384,
"learning_rate": 9.999505144928566e-06,
"loss": 1.0118,
"step": 440
},
{
"epoch": 0.3200568990042674,
"grad_norm": 3.4571934113589893,
"learning_rate": 9.998751763712045e-06,
"loss": 0.915,
"step": 450
},
{
"epoch": 0.32716927453769556,
"grad_norm": 3.3733896978659215,
"learning_rate": 9.997656004039284e-06,
"loss": 0.8872,
"step": 460
},
{
"epoch": 0.33428165007112376,
"grad_norm": 3.1986482463279344,
"learning_rate": 9.99621794096192e-06,
"loss": 0.9233,
"step": 470
},
{
"epoch": 0.3413940256045519,
"grad_norm": 3.3781480125146217,
"learning_rate": 9.994437672976904e-06,
"loss": 0.8156,
"step": 480
},
{
"epoch": 0.3485064011379801,
"grad_norm": 3.6561286544224516,
"learning_rate": 9.99231532201976e-06,
"loss": 0.8749,
"step": 490
},
{
"epoch": 0.35561877667140823,
"grad_norm": 4.142627644307138,
"learning_rate": 9.989851033456224e-06,
"loss": 0.8598,
"step": 500
},
{
"epoch": 0.3627311522048364,
"grad_norm": 3.7494771233239828,
"learning_rate": 9.987044976072298e-06,
"loss": 0.8118,
"step": 510
},
{
"epoch": 0.36984352773826457,
"grad_norm": 3.6547956812812123,
"learning_rate": 9.983897342062681e-06,
"loss": 0.8227,
"step": 520
},
{
"epoch": 0.37695590327169276,
"grad_norm": 3.679890083139226,
"learning_rate": 9.98040834701761e-06,
"loss": 0.8132,
"step": 530
},
{
"epoch": 0.3840682788051209,
"grad_norm": 3.252191257909053,
"learning_rate": 9.97657822990809e-06,
"loss": 0.7806,
"step": 540
},
{
"epoch": 0.3911806543385491,
"grad_norm": 3.614922960561001,
"learning_rate": 9.972407253069527e-06,
"loss": 0.8095,
"step": 550
},
{
"epoch": 0.39829302987197723,
"grad_norm": 3.793537378483368,
"learning_rate": 9.967895702183767e-06,
"loss": 0.7911,
"step": 560
},
{
"epoch": 0.40540540540540543,
"grad_norm": 3.65980827340659,
"learning_rate": 9.963043886259518e-06,
"loss": 0.7712,
"step": 570
},
{
"epoch": 0.41251778093883357,
"grad_norm": 3.5164539759645037,
"learning_rate": 9.957852137611187e-06,
"loss": 0.7634,
"step": 580
},
{
"epoch": 0.41963015647226176,
"grad_norm": 3.3236842648189633,
"learning_rate": 9.952320811836129e-06,
"loss": 0.6903,
"step": 590
},
{
"epoch": 0.4267425320056899,
"grad_norm": 3.294343434220933,
"learning_rate": 9.94645028779028e-06,
"loss": 0.7238,
"step": 600
},
{
"epoch": 0.43385490753911804,
"grad_norm": 3.4974393759929208,
"learning_rate": 9.94024096756221e-06,
"loss": 0.694,
"step": 610
},
{
"epoch": 0.44096728307254623,
"grad_norm": 4.433758888856019,
"learning_rate": 9.933693276445588e-06,
"loss": 0.7057,
"step": 620
},
{
"epoch": 0.4480796586059744,
"grad_norm": 3.3896425434092503,
"learning_rate": 9.92680766291005e-06,
"loss": 0.7001,
"step": 630
},
{
"epoch": 0.45519203413940257,
"grad_norm": 3.2995707993625834,
"learning_rate": 9.91958459857048e-06,
"loss": 0.6451,
"step": 640
},
{
"epoch": 0.4623044096728307,
"grad_norm": 3.5589453987217805,
"learning_rate": 9.912024578154706e-06,
"loss": 0.6539,
"step": 650
},
{
"epoch": 0.4694167852062589,
"grad_norm": 3.457156793924661,
"learning_rate": 9.904128119469625e-06,
"loss": 0.6383,
"step": 660
},
{
"epoch": 0.47652916073968704,
"grad_norm": 3.791061357289613,
"learning_rate": 9.895895763365722e-06,
"loss": 0.6319,
"step": 670
},
{
"epoch": 0.48364153627311524,
"grad_norm": 3.7253719001786307,
"learning_rate": 9.88732807370004e-06,
"loss": 0.589,
"step": 680
},
{
"epoch": 0.4907539118065434,
"grad_norm": 3.8753257386340167,
"learning_rate": 9.878425637297549e-06,
"loss": 0.5236,
"step": 690
},
{
"epoch": 0.49786628733997157,
"grad_norm": 3.810036186400155,
"learning_rate": 9.869189063910959e-06,
"loss": 0.524,
"step": 700
},
{
"epoch": 0.5049786628733998,
"grad_norm": 4.2180281642967365,
"learning_rate": 9.859618986178953e-06,
"loss": 0.5336,
"step": 710
},
{
"epoch": 0.5120910384068279,
"grad_norm": 3.938273345051735,
"learning_rate": 9.84971605958286e-06,
"loss": 0.5202,
"step": 720
},
{
"epoch": 0.519203413940256,
"grad_norm": 3.5712127017141397,
"learning_rate": 9.839480962401753e-06,
"loss": 0.4938,
"step": 730
},
{
"epoch": 0.5263157894736842,
"grad_norm": 3.383580945232286,
"learning_rate": 9.828914395665996e-06,
"loss": 0.4503,
"step": 740
},
{
"epoch": 0.5334281650071123,
"grad_norm": 3.850151538007975,
"learning_rate": 9.818017083109233e-06,
"loss": 0.5067,
"step": 750
},
{
"epoch": 0.5405405405405406,
"grad_norm": 3.579242735091459,
"learning_rate": 9.8067897711188e-06,
"loss": 0.4296,
"step": 760
},
{
"epoch": 0.5476529160739687,
"grad_norm": 3.33637898169204,
"learning_rate": 9.795233228684631e-06,
"loss": 0.422,
"step": 770
},
{
"epoch": 0.5547652916073968,
"grad_norm": 3.3180173487560998,
"learning_rate": 9.783348247346558e-06,
"loss": 0.4352,
"step": 780
},
{
"epoch": 0.561877667140825,
"grad_norm": 3.3074859328364172,
"learning_rate": 9.771135641140117e-06,
"loss": 0.3788,
"step": 790
},
{
"epoch": 0.5689900426742532,
"grad_norm": 3.935128904527344,
"learning_rate": 9.758596246540782e-06,
"loss": 0.4512,
"step": 800
},
{
"epoch": 0.5761024182076814,
"grad_norm": 3.130800872692149,
"learning_rate": 9.74573092240668e-06,
"loss": 0.4286,
"step": 810
},
{
"epoch": 0.5832147937411095,
"grad_norm": 3.4818017716980076,
"learning_rate": 9.732540549919758e-06,
"loss": 0.3976,
"step": 820
},
{
"epoch": 0.5903271692745377,
"grad_norm": 3.7176422056718708,
"learning_rate": 9.719026032525432e-06,
"loss": 0.3845,
"step": 830
},
{
"epoch": 0.5974395448079659,
"grad_norm": 4.0428367587373115,
"learning_rate": 9.70518829587071e-06,
"loss": 0.3761,
"step": 840
},
{
"epoch": 0.604551920341394,
"grad_norm": 3.32333703731893,
"learning_rate": 9.691028287740783e-06,
"loss": 0.3663,
"step": 850
},
{
"epoch": 0.6116642958748222,
"grad_norm": 4.055447477108677,
"learning_rate": 9.67654697799412e-06,
"loss": 0.3683,
"step": 860
},
{
"epoch": 0.6187766714082503,
"grad_norm": 2.801736293850873,
"learning_rate": 9.661745358496033e-06,
"loss": 0.3302,
"step": 870
},
{
"epoch": 0.6258890469416786,
"grad_norm": 2.9454979478833576,
"learning_rate": 9.64662444305074e-06,
"loss": 0.3714,
"step": 880
},
{
"epoch": 0.6330014224751067,
"grad_norm": 3.933969741535959,
"learning_rate": 9.631185267331937e-06,
"loss": 0.3214,
"step": 890
},
{
"epoch": 0.6401137980085349,
"grad_norm": 3.0707180797561398,
"learning_rate": 9.615428888811842e-06,
"loss": 0.3151,
"step": 900
},
{
"epoch": 0.647226173541963,
"grad_norm": 3.6006782352295095,
"learning_rate": 9.59935638668879e-06,
"loss": 0.3134,
"step": 910
},
{
"epoch": 0.6543385490753911,
"grad_norm": 4.528381319074012,
"learning_rate": 9.582968861813295e-06,
"loss": 0.2826,
"step": 920
},
{
"epoch": 0.6614509246088194,
"grad_norm": 3.084970600037643,
"learning_rate": 9.566267436612662e-06,
"loss": 0.3272,
"step": 930
},
{
"epoch": 0.6685633001422475,
"grad_norm": 3.1926454881670008,
"learning_rate": 9.549253255014105e-06,
"loss": 0.2838,
"step": 940
},
{
"epoch": 0.6756756756756757,
"grad_norm": 3.3232334022391083,
"learning_rate": 9.531927482366398e-06,
"loss": 0.2676,
"step": 950
},
{
"epoch": 0.6827880512091038,
"grad_norm": 3.373450413027547,
"learning_rate": 9.514291305360053e-06,
"loss": 0.2615,
"step": 960
},
{
"epoch": 0.689900426742532,
"grad_norm": 3.298511219641843,
"learning_rate": 9.496345931946039e-06,
"loss": 0.2232,
"step": 970
},
{
"epoch": 0.6970128022759602,
"grad_norm": 2.8709213001564726,
"learning_rate": 9.47809259125306e-06,
"loss": 0.2628,
"step": 980
},
{
"epoch": 0.7041251778093883,
"grad_norm": 3.0027633203506,
"learning_rate": 9.459532533503347e-06,
"loss": 0.2404,
"step": 990
},
{
"epoch": 0.7112375533428165,
"grad_norm": 3.0886670354052823,
"learning_rate": 9.440667029927043e-06,
"loss": 0.2259,
"step": 1000
},
{
"epoch": 0.7183499288762447,
"grad_norm": 3.413560155663082,
"learning_rate": 9.421497372675133e-06,
"loss": 0.208,
"step": 1010
},
{
"epoch": 0.7254623044096729,
"grad_norm": 2.26900305381711,
"learning_rate": 9.402024874730928e-06,
"loss": 0.2277,
"step": 1020
},
{
"epoch": 0.732574679943101,
"grad_norm": 3.5894430284698315,
"learning_rate": 9.382250869820146e-06,
"loss": 0.1926,
"step": 1030
},
{
"epoch": 0.7396870554765291,
"grad_norm": 3.267737905170995,
"learning_rate": 9.36217671231956e-06,
"loss": 0.2299,
"step": 1040
},
{
"epoch": 0.7467994310099573,
"grad_norm": 2.7538943048992737,
"learning_rate": 9.341803777164228e-06,
"loss": 0.1708,
"step": 1050
},
{
"epoch": 0.7539118065433855,
"grad_norm": 3.867540040555883,
"learning_rate": 9.321133459753322e-06,
"loss": 0.2072,
"step": 1060
},
{
"epoch": 0.7610241820768137,
"grad_norm": 2.3384449104832226,
"learning_rate": 9.300167175854564e-06,
"loss": 0.1875,
"step": 1070
},
{
"epoch": 0.7681365576102418,
"grad_norm": 3.6436777076779348,
"learning_rate": 9.278906361507238e-06,
"loss": 0.173,
"step": 1080
},
{
"epoch": 0.7752489331436699,
"grad_norm": 2.623342004246653,
"learning_rate": 9.257352472923842e-06,
"loss": 0.1489,
"step": 1090
},
{
"epoch": 0.7823613086770982,
"grad_norm": 2.9293688128652606,
"learning_rate": 9.235506986390346e-06,
"loss": 0.1423,
"step": 1100
},
{
"epoch": 0.7894736842105263,
"grad_norm": 3.1229986788680653,
"learning_rate": 9.213371398165077e-06,
"loss": 0.1564,
"step": 1110
},
{
"epoch": 0.7965860597439545,
"grad_norm": 3.5638406658438826,
"learning_rate": 9.190947224376238e-06,
"loss": 0.1872,
"step": 1120
},
{
"epoch": 0.8036984352773826,
"grad_norm": 3.754826640146973,
"learning_rate": 9.168236000918063e-06,
"loss": 0.1483,
"step": 1130
},
{
"epoch": 0.8108108108108109,
"grad_norm": 2.494125324383473,
"learning_rate": 9.145239283345618e-06,
"loss": 0.1272,
"step": 1140
},
{
"epoch": 0.817923186344239,
"grad_norm": 2.1750463421723003,
"learning_rate": 9.121958646768251e-06,
"loss": 0.1361,
"step": 1150
},
{
"epoch": 0.8250355618776671,
"grad_norm": 2.6835693031385035,
"learning_rate": 9.09839568574173e-06,
"loss": 0.1001,
"step": 1160
},
{
"epoch": 0.8321479374110953,
"grad_norm": 2.520530993255376,
"learning_rate": 9.074552014158994e-06,
"loss": 0.1193,
"step": 1170
},
{
"epoch": 0.8392603129445235,
"grad_norm": 2.583475505711053,
"learning_rate": 9.050429265139647e-06,
"loss": 0.1122,
"step": 1180
},
{
"epoch": 0.8463726884779517,
"grad_norm": 3.0551608668064736,
"learning_rate": 9.026029090918076e-06,
"loss": 0.1345,
"step": 1190
},
{
"epoch": 0.8534850640113798,
"grad_norm": 2.7079152732306917,
"learning_rate": 9.001353162730297e-06,
"loss": 0.1134,
"step": 1200
},
{
"epoch": 0.8605974395448079,
"grad_norm": 2.2611924634890075,
"learning_rate": 8.976403170699486e-06,
"loss": 0.1026,
"step": 1210
},
{
"epoch": 0.8677098150782361,
"grad_norm": 1.987002883566529,
"learning_rate": 8.951180823720212e-06,
"loss": 0.0967,
"step": 1220
},
{
"epoch": 0.8748221906116643,
"grad_norm": 2.7967317585114615,
"learning_rate": 8.925687849341398e-06,
"loss": 0.0819,
"step": 1230
},
{
"epoch": 0.8819345661450925,
"grad_norm": 2.65859268119004,
"learning_rate": 8.899925993647994e-06,
"loss": 0.0931,
"step": 1240
},
{
"epoch": 0.8890469416785206,
"grad_norm": 2.5541801040927226,
"learning_rate": 8.873897021141378e-06,
"loss": 0.0888,
"step": 1250
},
{
"epoch": 0.8961593172119487,
"grad_norm": 2.6513786896328413,
"learning_rate": 8.847602714618504e-06,
"loss": 0.0839,
"step": 1260
},
{
"epoch": 0.903271692745377,
"grad_norm": 2.3497391818693587,
"learning_rate": 8.821044875049796e-06,
"loss": 0.0878,
"step": 1270
},
{
"epoch": 0.9103840682788051,
"grad_norm": 2.067880100094928,
"learning_rate": 8.794225321455788e-06,
"loss": 0.0866,
"step": 1280
},
{
"epoch": 0.9174964438122333,
"grad_norm": 2.466600341108382,
"learning_rate": 8.767145890782542e-06,
"loss": 0.0849,
"step": 1290
},
{
"epoch": 0.9246088193456614,
"grad_norm": 2.694537159823399,
"learning_rate": 8.739808437775825e-06,
"loss": 0.0773,
"step": 1300
},
{
"epoch": 0.9317211948790897,
"grad_norm": 3.051119356918663,
"learning_rate": 8.71221483485407e-06,
"loss": 0.0887,
"step": 1310
},
{
"epoch": 0.9388335704125178,
"grad_norm": 2.091226963672429,
"learning_rate": 8.684366971980139e-06,
"loss": 0.0739,
"step": 1320
},
{
"epoch": 0.9459459459459459,
"grad_norm": 2.6573993659558885,
"learning_rate": 8.656266756531857e-06,
"loss": 0.0757,
"step": 1330
},
{
"epoch": 0.9530583214793741,
"grad_norm": 2.5135440840845593,
"learning_rate": 8.627916113171396e-06,
"loss": 0.0695,
"step": 1340
},
{
"epoch": 0.9601706970128022,
"grad_norm": 1.8647689285533582,
"learning_rate": 8.599316983713419e-06,
"loss": 0.0703,
"step": 1350
},
{
"epoch": 0.9672830725462305,
"grad_norm": 2.1656321527764444,
"learning_rate": 8.570471326992105e-06,
"loss": 0.062,
"step": 1360
},
{
"epoch": 0.9743954480796586,
"grad_norm": 2.705238359384965,
"learning_rate": 8.54138111872697e-06,
"loss": 0.0755,
"step": 1370
},
{
"epoch": 0.9815078236130867,
"grad_norm": 1.4926114349562027,
"learning_rate": 8.512048351387551e-06,
"loss": 0.0656,
"step": 1380
},
{
"epoch": 0.9886201991465149,
"grad_norm": 2.193183643997932,
"learning_rate": 8.482475034056927e-06,
"loss": 0.0659,
"step": 1390
},
{
"epoch": 0.9957325746799431,
"grad_norm": 2.0527279052017264,
"learning_rate": 8.452663192294121e-06,
"loss": 0.0576,
"step": 1400
},
{
"epoch": 1.0028449502133712,
"grad_norm": 2.043379604895136,
"learning_rate": 8.42261486799536e-06,
"loss": 0.0518,
"step": 1410
},
{
"epoch": 1.0099573257467995,
"grad_norm": 1.7935460456418109,
"learning_rate": 8.392332119254214e-06,
"loss": 0.0363,
"step": 1420
},
{
"epoch": 1.0170697012802277,
"grad_norm": 1.9591421706180754,
"learning_rate": 8.361817020220647e-06,
"loss": 0.0345,
"step": 1430
},
{
"epoch": 1.0241820768136558,
"grad_norm": 1.904127146547918,
"learning_rate": 8.331071660958936e-06,
"loss": 0.039,
"step": 1440
},
{
"epoch": 1.031294452347084,
"grad_norm": 1.8927150070468237,
"learning_rate": 8.300098147304523e-06,
"loss": 0.0365,
"step": 1450
},
{
"epoch": 1.038406827880512,
"grad_norm": 1.9578224146696355,
"learning_rate": 8.268898600719785e-06,
"loss": 0.0431,
"step": 1460
},
{
"epoch": 1.0455192034139402,
"grad_norm": 2.119890142949488,
"learning_rate": 8.237475158148724e-06,
"loss": 0.0429,
"step": 1470
},
{
"epoch": 1.0526315789473684,
"grad_norm": 1.9482483964200852,
"learning_rate": 8.205829971870602e-06,
"loss": 0.0397,
"step": 1480
},
{
"epoch": 1.0597439544807965,
"grad_norm": 1.7329874393672655,
"learning_rate": 8.173965209352524e-06,
"loss": 0.0344,
"step": 1490
},
{
"epoch": 1.0668563300142249,
"grad_norm": 1.8911139378477928,
"learning_rate": 8.14188305310099e-06,
"loss": 0.0464,
"step": 1500
},
{
"epoch": 1.073968705547653,
"grad_norm": 2.450233012383526,
"learning_rate": 8.109585700512395e-06,
"loss": 0.0375,
"step": 1510
},
{
"epoch": 1.0810810810810811,
"grad_norm": 2.0138094788301166,
"learning_rate": 8.077075363722542e-06,
"loss": 0.0389,
"step": 1520
},
{
"epoch": 1.0881934566145093,
"grad_norm": 2.076572644222088,
"learning_rate": 8.044354269455109e-06,
"loss": 0.0436,
"step": 1530
},
{
"epoch": 1.0953058321479374,
"grad_norm": 1.9101229450735917,
"learning_rate": 8.011424658869142e-06,
"loss": 0.0357,
"step": 1540
},
{
"epoch": 1.1024182076813656,
"grad_norm": 1.130649417703215,
"learning_rate": 7.978288787405556e-06,
"loss": 0.0362,
"step": 1550
},
{
"epoch": 1.1095305832147937,
"grad_norm": 1.1581533245467266,
"learning_rate": 7.944948924632643e-06,
"loss": 0.0345,
"step": 1560
},
{
"epoch": 1.1166429587482218,
"grad_norm": 1.6643524677849526,
"learning_rate": 7.911407354090634e-06,
"loss": 0.0354,
"step": 1570
},
{
"epoch": 1.12375533428165,
"grad_norm": 1.9726198917599644,
"learning_rate": 7.877666373135287e-06,
"loss": 0.0346,
"step": 1580
},
{
"epoch": 1.1308677098150781,
"grad_norm": 1.6692436200631287,
"learning_rate": 7.84372829278053e-06,
"loss": 0.038,
"step": 1590
},
{
"epoch": 1.1379800853485065,
"grad_norm": 1.7045565380565189,
"learning_rate": 7.809595437540189e-06,
"loss": 0.0327,
"step": 1600
},
{
"epoch": 1.1450924608819346,
"grad_norm": 1.9976160352568044,
"learning_rate": 7.775270145268755e-06,
"loss": 0.0256,
"step": 1610
},
{
"epoch": 1.1522048364153628,
"grad_norm": 1.3781171703418404,
"learning_rate": 7.740754767001278e-06,
"loss": 0.039,
"step": 1620
},
{
"epoch": 1.159317211948791,
"grad_norm": 1.675366937408603,
"learning_rate": 7.706051666792318e-06,
"loss": 0.0353,
"step": 1630
},
{
"epoch": 1.166429587482219,
"grad_norm": 1.5507760610752672,
"learning_rate": 7.671163221554043e-06,
"loss": 0.0353,
"step": 1640
},
{
"epoch": 1.1735419630156472,
"grad_norm": 1.5578057994726024,
"learning_rate": 7.636091820893417e-06,
"loss": 0.0374,
"step": 1650
},
{
"epoch": 1.1806543385490753,
"grad_norm": 1.9536673456849045,
"learning_rate": 7.600839866948528e-06,
"loss": 0.0363,
"step": 1660
},
{
"epoch": 1.1877667140825037,
"grad_norm": 1.4180294508669007,
"learning_rate": 7.565409774224066e-06,
"loss": 0.0349,
"step": 1670
},
{
"epoch": 1.1948790896159318,
"grad_norm": 1.6616296432221909,
"learning_rate": 7.529803969425941e-06,
"loss": 0.0307,
"step": 1680
},
{
"epoch": 1.20199146514936,
"grad_norm": 1.7138246686303804,
"learning_rate": 7.494024891295075e-06,
"loss": 0.0322,
"step": 1690
},
{
"epoch": 1.209103840682788,
"grad_norm": 1.3613855884690513,
"learning_rate": 7.458074990440363e-06,
"loss": 0.0293,
"step": 1700
},
{
"epoch": 1.2162162162162162,
"grad_norm": 2.4114521805394205,
"learning_rate": 7.421956729170823e-06,
"loss": 0.0344,
"step": 1710
},
{
"epoch": 1.2233285917496444,
"grad_norm": 1.9233612034450194,
"learning_rate": 7.385672581326954e-06,
"loss": 0.0351,
"step": 1720
},
{
"epoch": 1.2304409672830725,
"grad_norm": 1.7307194070590812,
"learning_rate": 7.34922503211128e-06,
"loss": 0.0353,
"step": 1730
},
{
"epoch": 1.2375533428165006,
"grad_norm": 1.468735660134803,
"learning_rate": 7.312616577918149e-06,
"loss": 0.03,
"step": 1740
},
{
"epoch": 1.2446657183499288,
"grad_norm": 0.9815553395553774,
"learning_rate": 7.2758497261627345e-06,
"loss": 0.0267,
"step": 1750
},
{
"epoch": 1.251778093883357,
"grad_norm": 1.4851270984075178,
"learning_rate": 7.238926995109306e-06,
"loss": 0.0288,
"step": 1760
},
{
"epoch": 1.2588904694167853,
"grad_norm": 2.2537032746619183,
"learning_rate": 7.201850913698736e-06,
"loss": 0.0364,
"step": 1770
},
{
"epoch": 1.2660028449502134,
"grad_norm": 1.454211009387941,
"learning_rate": 7.164624021375294e-06,
"loss": 0.0252,
"step": 1780
},
{
"epoch": 1.2731152204836416,
"grad_norm": 1.4034123768391151,
"learning_rate": 7.12724886791271e-06,
"loss": 0.0266,
"step": 1790
},
{
"epoch": 1.2802275960170697,
"grad_norm": 1.546526107411268,
"learning_rate": 7.08972801323953e-06,
"loss": 0.03,
"step": 1800
},
{
"epoch": 1.2873399715504978,
"grad_norm": 1.6929689381873503,
"learning_rate": 7.052064027263785e-06,
"loss": 0.0235,
"step": 1810
},
{
"epoch": 1.294452347083926,
"grad_norm": 1.5130921744879449,
"learning_rate": 7.014259489696968e-06,
"loss": 0.0243,
"step": 1820
},
{
"epoch": 1.3015647226173541,
"grad_norm": 1.9572718096346318,
"learning_rate": 6.976316989877343e-06,
"loss": 0.0249,
"step": 1830
},
{
"epoch": 1.3086770981507825,
"grad_norm": 1.2611303057850376,
"learning_rate": 6.938239126592592e-06,
"loss": 0.0263,
"step": 1840
},
{
"epoch": 1.3157894736842106,
"grad_norm": 1.2902816153314383,
"learning_rate": 6.90002850790182e-06,
"loss": 0.0298,
"step": 1850
},
{
"epoch": 1.3229018492176388,
"grad_norm": 0.9719782814773048,
"learning_rate": 6.861687750956922e-06,
"loss": 0.027,
"step": 1860
},
{
"epoch": 1.330014224751067,
"grad_norm": 1.1718631838309244,
"learning_rate": 6.823219481823318e-06,
"loss": 0.0245,
"step": 1870
},
{
"epoch": 1.337126600284495,
"grad_norm": 1.3461970346065844,
"learning_rate": 6.784626335300102e-06,
"loss": 0.0198,
"step": 1880
},
{
"epoch": 1.3442389758179232,
"grad_norm": 1.1445639186428003,
"learning_rate": 6.745910954739563e-06,
"loss": 0.0274,
"step": 1890
},
{
"epoch": 1.3513513513513513,
"grad_norm": 1.9649035858601103,
"learning_rate": 6.707075991866143e-06,
"loss": 0.0268,
"step": 1900
},
{
"epoch": 1.3584637268847795,
"grad_norm": 1.3779682004442027,
"learning_rate": 6.668124106594813e-06,
"loss": 0.0274,
"step": 1910
},
{
"epoch": 1.3655761024182076,
"grad_norm": 0.9339287727084011,
"learning_rate": 6.629057966848879e-06,
"loss": 0.0244,
"step": 1920
},
{
"epoch": 1.3726884779516357,
"grad_norm": 1.3418194746364869,
"learning_rate": 6.589880248377258e-06,
"loss": 0.023,
"step": 1930
},
{
"epoch": 1.379800853485064,
"grad_norm": 1.6101698103903805,
"learning_rate": 6.550593634571205e-06,
"loss": 0.018,
"step": 1940
},
{
"epoch": 1.3869132290184922,
"grad_norm": 1.7415141112043047,
"learning_rate": 6.511200816280523e-06,
"loss": 0.021,
"step": 1950
},
{
"epoch": 1.3940256045519204,
"grad_norm": 1.2100486434644262,
"learning_rate": 6.471704491629251e-06,
"loss": 0.0285,
"step": 1960
},
{
"epoch": 1.4011379800853485,
"grad_norm": 1.301261422264456,
"learning_rate": 6.432107365830872e-06,
"loss": 0.0198,
"step": 1970
},
{
"epoch": 1.4082503556187767,
"grad_norm": 1.3543714484816034,
"learning_rate": 6.392412151003019e-06,
"loss": 0.0244,
"step": 1980
},
{
"epoch": 1.4153627311522048,
"grad_norm": 1.4893305665999936,
"learning_rate": 6.3526215659817156e-06,
"loss": 0.0226,
"step": 1990
},
{
"epoch": 1.422475106685633,
"grad_norm": 1.1217736569772296,
"learning_rate": 6.312738336135159e-06,
"loss": 0.019,
"step": 2000
},
{
"epoch": 1.4295874822190613,
"grad_norm": 1.530506526795571,
"learning_rate": 6.272765193177044e-06,
"loss": 0.0196,
"step": 2010
},
{
"epoch": 1.4366998577524894,
"grad_norm": 1.1830746085813704,
"learning_rate": 6.23270487497947e-06,
"loss": 0.0189,
"step": 2020
},
{
"epoch": 1.4438122332859176,
"grad_norm": 1.3714016439826322,
"learning_rate": 6.192560125385412e-06,
"loss": 0.025,
"step": 2030
},
{
"epoch": 1.4509246088193457,
"grad_norm": 1.1129988250796872,
"learning_rate": 6.152333694020781e-06,
"loss": 0.0184,
"step": 2040
},
{
"epoch": 1.4580369843527738,
"grad_norm": 2.0430785612059346,
"learning_rate": 6.112028336106108e-06,
"loss": 0.023,
"step": 2050
},
{
"epoch": 1.465149359886202,
"grad_norm": 1.4200748013522733,
"learning_rate": 6.071646812267817e-06,
"loss": 0.0167,
"step": 2060
},
{
"epoch": 1.4722617354196301,
"grad_norm": 1.8027434372189237,
"learning_rate": 6.031191888349155e-06,
"loss": 0.0202,
"step": 2070
},
{
"epoch": 1.4793741109530583,
"grad_norm": 1.1171787456661884,
"learning_rate": 5.990666335220738e-06,
"loss": 0.0178,
"step": 2080
},
{
"epoch": 1.4864864864864864,
"grad_norm": 1.6452874612147976,
"learning_rate": 5.950072928590781e-06,
"loss": 0.018,
"step": 2090
},
{
"epoch": 1.4935988620199145,
"grad_norm": 0.9884439749765455,
"learning_rate": 5.909414448814971e-06,
"loss": 0.0209,
"step": 2100
},
{
"epoch": 1.5007112375533427,
"grad_norm": 1.554996157376441,
"learning_rate": 5.8686936807060335e-06,
"loss": 0.0192,
"step": 2110
},
{
"epoch": 1.5078236130867708,
"grad_norm": 1.0929475144672365,
"learning_rate": 5.827913413343003e-06,
"loss": 0.018,
"step": 2120
},
{
"epoch": 1.5149359886201992,
"grad_norm": 1.0492081159201816,
"learning_rate": 5.787076439880177e-06,
"loss": 0.0179,
"step": 2130
},
{
"epoch": 1.5220483641536273,
"grad_norm": 1.2333928332291602,
"learning_rate": 5.746185557355814e-06,
"loss": 0.0211,
"step": 2140
},
{
"epoch": 1.5291607396870555,
"grad_norm": 0.8940904857757537,
"learning_rate": 5.70524356650056e-06,
"loss": 0.0168,
"step": 2150
},
{
"epoch": 1.5362731152204836,
"grad_norm": 0.9594678027850269,
"learning_rate": 5.664253271545603e-06,
"loss": 0.0172,
"step": 2160
},
{
"epoch": 1.543385490753912,
"grad_norm": 1.133529225026687,
"learning_rate": 5.623217480030622e-06,
"loss": 0.0178,
"step": 2170
},
{
"epoch": 1.55049786628734,
"grad_norm": 1.0245366404113008,
"learning_rate": 5.58213900261148e-06,
"loss": 0.0135,
"step": 2180
},
{
"epoch": 1.5576102418207682,
"grad_norm": 0.7068889699880522,
"learning_rate": 5.541020652867713e-06,
"loss": 0.0153,
"step": 2190
},
{
"epoch": 1.5647226173541964,
"grad_norm": 1.2084727884034199,
"learning_rate": 5.49986524710983e-06,
"loss": 0.0143,
"step": 2200
},
{
"epoch": 1.5718349928876245,
"grad_norm": 1.5054621892964164,
"learning_rate": 5.4586756041864065e-06,
"loss": 0.016,
"step": 2210
},
{
"epoch": 1.5789473684210527,
"grad_norm": 1.4176580158063212,
"learning_rate": 5.417454545291017e-06,
"loss": 0.0168,
"step": 2220
},
{
"epoch": 1.5860597439544808,
"grad_norm": 1.1824924291702557,
"learning_rate": 5.376204893769e-06,
"loss": 0.0198,
"step": 2230
},
{
"epoch": 1.593172119487909,
"grad_norm": 1.7631808589665254,
"learning_rate": 5.334929474924093e-06,
"loss": 0.0155,
"step": 2240
},
{
"epoch": 1.600284495021337,
"grad_norm": 1.215149372258629,
"learning_rate": 5.293631115824897e-06,
"loss": 0.0138,
"step": 2250
},
{
"epoch": 1.6073968705547652,
"grad_norm": 1.718329335563461,
"learning_rate": 5.252312645111266e-06,
"loss": 0.0173,
"step": 2260
},
{
"epoch": 1.6145092460881934,
"grad_norm": 1.0751615799620988,
"learning_rate": 5.2109768928005454e-06,
"loss": 0.0142,
"step": 2270
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.8027120709435296,
"learning_rate": 5.169626690093751e-06,
"loss": 0.014,
"step": 2280
},
{
"epoch": 1.6287339971550496,
"grad_norm": 1.6699231722730825,
"learning_rate": 5.128264869181646e-06,
"loss": 0.0127,
"step": 2290
},
{
"epoch": 1.635846372688478,
"grad_norm": 1.2559995566307685,
"learning_rate": 5.086894263050755e-06,
"loss": 0.011,
"step": 2300
},
{
"epoch": 1.6429587482219061,
"grad_norm": 1.349960059022035,
"learning_rate": 5.045517705289328e-06,
"loss": 0.0111,
"step": 2310
},
{
"epoch": 1.6500711237553343,
"grad_norm": 0.8142603267011976,
"learning_rate": 5.004138029893257e-06,
"loss": 0.0138,
"step": 2320
},
{
"epoch": 1.6571834992887624,
"grad_norm": 1.0621437820203163,
"learning_rate": 4.9627580710719734e-06,
"loss": 0.0128,
"step": 2330
},
{
"epoch": 1.6642958748221908,
"grad_norm": 1.7262184368035551,
"learning_rate": 4.921380663054318e-06,
"loss": 0.0128,
"step": 2340
},
{
"epoch": 1.671408250355619,
"grad_norm": 1.2695847947859624,
"learning_rate": 4.880008639894421e-06,
"loss": 0.014,
"step": 2350
},
{
"epoch": 1.678520625889047,
"grad_norm": 0.9261536386806662,
"learning_rate": 4.838644835277585e-06,
"loss": 0.0144,
"step": 2360
},
{
"epoch": 1.6856330014224752,
"grad_norm": 0.6867762051400554,
"learning_rate": 4.79729208232621e-06,
"loss": 0.0109,
"step": 2370
},
{
"epoch": 1.6927453769559033,
"grad_norm": 0.6232870542134327,
"learning_rate": 4.75595321340573e-06,
"loss": 0.0122,
"step": 2380
},
{
"epoch": 1.6998577524893315,
"grad_norm": 0.970176828182309,
"learning_rate": 4.714631059930622e-06,
"loss": 0.012,
"step": 2390
},
{
"epoch": 1.7069701280227596,
"grad_norm": 1.6173382913062293,
"learning_rate": 4.6733284521704816e-06,
"loss": 0.0124,
"step": 2400
},
{
"epoch": 1.7140825035561877,
"grad_norm": 0.9844171855603,
"learning_rate": 4.632048219056159e-06,
"loss": 0.012,
"step": 2410
},
{
"epoch": 1.7211948790896159,
"grad_norm": 1.3183824382551952,
"learning_rate": 4.590793187986003e-06,
"loss": 0.0149,
"step": 2420
},
{
"epoch": 1.728307254623044,
"grad_norm": 0.5730734000902559,
"learning_rate": 4.549566184632206e-06,
"loss": 0.0117,
"step": 2430
},
{
"epoch": 1.7354196301564722,
"grad_norm": 0.9239894283732394,
"learning_rate": 4.508370032747261e-06,
"loss": 0.0092,
"step": 2440
},
{
"epoch": 1.7425320056899003,
"grad_norm": 0.9732516534559529,
"learning_rate": 4.467207553970564e-06,
"loss": 0.012,
"step": 2450
},
{
"epoch": 1.7496443812233284,
"grad_norm": 0.9139268416210883,
"learning_rate": 4.426081567635137e-06,
"loss": 0.0092,
"step": 2460
},
{
"epoch": 1.7567567567567568,
"grad_norm": 1.2921223854630304,
"learning_rate": 4.3849948905745385e-06,
"loss": 0.0137,
"step": 2470
},
{
"epoch": 1.763869132290185,
"grad_norm": 0.8703692417885042,
"learning_rate": 4.343950336929927e-06,
"loss": 0.0095,
"step": 2480
},
{
"epoch": 1.770981507823613,
"grad_norm": 0.9536442700427114,
"learning_rate": 4.302950717957304e-06,
"loss": 0.0098,
"step": 2490
},
{
"epoch": 1.7780938833570412,
"grad_norm": 0.852536162993322,
"learning_rate": 4.261998841834972e-06,
"loss": 0.0101,
"step": 2500
},
{
"epoch": 1.7852062588904696,
"grad_norm": 1.248725823462744,
"learning_rate": 4.221097513471199e-06,
"loss": 0.0094,
"step": 2510
},
{
"epoch": 1.7923186344238977,
"grad_norm": 0.487586863686056,
"learning_rate": 4.18024953431209e-06,
"loss": 0.009,
"step": 2520
},
{
"epoch": 1.7994310099573259,
"grad_norm": 0.6857485925261184,
"learning_rate": 4.13945770214971e-06,
"loss": 0.0098,
"step": 2530
},
{
"epoch": 1.806543385490754,
"grad_norm": 0.5224101041795471,
"learning_rate": 4.098724810930472e-06,
"loss": 0.0077,
"step": 2540
},
{
"epoch": 1.8136557610241821,
"grad_norm": 0.3255236838052598,
"learning_rate": 4.058053650563747e-06,
"loss": 0.0069,
"step": 2550
},
{
"epoch": 1.8207681365576103,
"grad_norm": 0.5535169044707119,
"learning_rate": 4.017447006730796e-06,
"loss": 0.0084,
"step": 2560
},
{
"epoch": 1.8278805120910384,
"grad_norm": 0.6587680546008802,
"learning_rate": 3.976907660693954e-06,
"loss": 0.0068,
"step": 2570
},
{
"epoch": 1.8349928876244666,
"grad_norm": 0.7451030339766666,
"learning_rate": 3.936438389106154e-06,
"loss": 0.0091,
"step": 2580
},
{
"epoch": 1.8421052631578947,
"grad_norm": 0.7854707802079127,
"learning_rate": 3.896041963820724e-06,
"loss": 0.0105,
"step": 2590
},
{
"epoch": 1.8492176386913228,
"grad_norm": 0.6990927586140553,
"learning_rate": 3.855721151701548e-06,
"loss": 0.0099,
"step": 2600
},
{
"epoch": 1.856330014224751,
"grad_norm": 1.318630670215527,
"learning_rate": 3.815478714433559e-06,
"loss": 0.0095,
"step": 2610
},
{
"epoch": 1.863442389758179,
"grad_norm": 0.8518153474787149,
"learning_rate": 3.775317408333571e-06,
"loss": 0.0105,
"step": 2620
},
{
"epoch": 1.8705547652916072,
"grad_norm": 1.0023735620026466,
"learning_rate": 3.7352399841614996e-06,
"loss": 0.0082,
"step": 2630
},
{
"epoch": 1.8776671408250356,
"grad_norm": 0.9809887806472293,
"learning_rate": 3.695249186931954e-06,
"loss": 0.0087,
"step": 2640
},
{
"epoch": 1.8847795163584637,
"grad_norm": 0.9540456428445807,
"learning_rate": 3.655347755726224e-06,
"loss": 0.0076,
"step": 2650
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.7066159412282622,
"learning_rate": 3.6155384235046674e-06,
"loss": 0.0086,
"step": 2660
},
{
"epoch": 1.89900426742532,
"grad_norm": 0.5137592216850851,
"learning_rate": 3.5758239169195276e-06,
"loss": 0.005,
"step": 2670
},
{
"epoch": 1.9061166429587484,
"grad_norm": 0.3439517878091387,
"learning_rate": 3.5362069561281764e-06,
"loss": 0.0072,
"step": 2680
},
{
"epoch": 1.9132290184921765,
"grad_norm": 0.3970319267325305,
"learning_rate": 3.4966902546068016e-06,
"loss": 0.0072,
"step": 2690
},
{
"epoch": 1.9203413940256047,
"grad_norm": 0.9810798909167313,
"learning_rate": 3.4572765189645516e-06,
"loss": 0.0073,
"step": 2700
},
{
"epoch": 1.9274537695590328,
"grad_norm": 1.4872117479815739,
"learning_rate": 3.4179684487581555e-06,
"loss": 0.0067,
"step": 2710
},
{
"epoch": 1.934566145092461,
"grad_norm": 0.17941271447530188,
"learning_rate": 3.3787687363070256e-06,
"loss": 0.0075,
"step": 2720
},
{
"epoch": 1.941678520625889,
"grad_norm": 0.21377268278340267,
"learning_rate": 3.3396800665088435e-06,
"loss": 0.0069,
"step": 2730
},
{
"epoch": 1.9487908961593172,
"grad_norm": 0.8027020001474104,
"learning_rate": 3.300705116655672e-06,
"loss": 0.0058,
"step": 2740
},
{
"epoch": 1.9559032716927454,
"grad_norm": 0.607769605088779,
"learning_rate": 3.26184655625058e-06,
"loss": 0.0055,
"step": 2750
},
{
"epoch": 1.9630156472261735,
"grad_norm": 0.29396831979764293,
"learning_rate": 3.2231070468247954e-06,
"loss": 0.0062,
"step": 2760
},
{
"epoch": 1.9701280227596016,
"grad_norm": 0.49083863249583537,
"learning_rate": 3.1844892417554102e-06,
"loss": 0.0063,
"step": 2770
},
{
"epoch": 1.9772403982930298,
"grad_norm": 0.710753958854101,
"learning_rate": 3.1459957860836528e-06,
"loss": 0.0065,
"step": 2780
},
{
"epoch": 1.984352773826458,
"grad_norm": 0.27012727932102704,
"learning_rate": 3.1076293163337074e-06,
"loss": 0.0068,
"step": 2790
},
{
"epoch": 1.991465149359886,
"grad_norm": 0.34603765606499187,
"learning_rate": 3.069392460332141e-06,
"loss": 0.0057,
"step": 2800
},
{
"epoch": 1.9985775248933144,
"grad_norm": 0.3721250969176249,
"learning_rate": 3.031287837027911e-06,
"loss": 0.0066,
"step": 2810
},
{
"epoch": 2.0056899004267423,
"grad_norm": 0.781768421432185,
"learning_rate": 2.9933180563129936e-06,
"loss": 0.0041,
"step": 2820
},
{
"epoch": 2.012802275960171,
"grad_norm": 0.24350008390092337,
"learning_rate": 2.955485718843616e-06,
"loss": 0.0056,
"step": 2830
},
{
"epoch": 2.019914651493599,
"grad_norm": 0.4576741832894929,
"learning_rate": 2.917793415862129e-06,
"loss": 0.0048,
"step": 2840
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.9890835980780475,
"learning_rate": 2.880243729019546e-06,
"loss": 0.0038,
"step": 2850
},
{
"epoch": 2.0341394025604553,
"grad_norm": 0.3917033136267895,
"learning_rate": 2.842839230198685e-06,
"loss": 0.0052,
"step": 2860
},
{
"epoch": 2.0412517780938835,
"grad_norm": 0.12450209954114903,
"learning_rate": 2.805582481338044e-06,
"loss": 0.0047,
"step": 2870
},
{
"epoch": 2.0483641536273116,
"grad_norm": 0.5486661654701261,
"learning_rate": 2.7684760342563045e-06,
"loss": 0.0047,
"step": 2880
},
{
"epoch": 2.0554765291607398,
"grad_norm": 0.22758726780410876,
"learning_rate": 2.731522430477571e-06,
"loss": 0.0056,
"step": 2890
},
{
"epoch": 2.062588904694168,
"grad_norm": 0.2218164583744802,
"learning_rate": 2.694724201057273e-06,
"loss": 0.0048,
"step": 2900
},
{
"epoch": 2.069701280227596,
"grad_norm": 0.45353402328041514,
"learning_rate": 2.6580838664088214e-06,
"loss": 0.0042,
"step": 2910
},
{
"epoch": 2.076813655761024,
"grad_norm": 0.29165554258590237,
"learning_rate": 2.6216039361309753e-06,
"loss": 0.0044,
"step": 2920
},
{
"epoch": 2.0839260312944523,
"grad_norm": 0.42787997336579114,
"learning_rate": 2.5852869088359495e-06,
"loss": 0.0041,
"step": 2930
},
{
"epoch": 2.0910384068278804,
"grad_norm": 0.44323215466285076,
"learning_rate": 2.549135271978275e-06,
"loss": 0.0032,
"step": 2940
},
{
"epoch": 2.0981507823613086,
"grad_norm": 0.1143123602309504,
"learning_rate": 2.5131515016844345e-06,
"loss": 0.0046,
"step": 2950
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.16583828479799412,
"learning_rate": 2.4773380625832603e-06,
"loss": 0.0047,
"step": 2960
},
{
"epoch": 2.112375533428165,
"grad_norm": 0.15755302830922696,
"learning_rate": 2.4416974076371304e-06,
"loss": 0.0039,
"step": 2970
},
{
"epoch": 2.119487908961593,
"grad_norm": 0.62834650400931,
"learning_rate": 2.406231977973942e-06,
"loss": 0.0037,
"step": 2980
},
{
"epoch": 2.126600284495021,
"grad_norm": 0.3425562134173693,
"learning_rate": 2.3709442027199387e-06,
"loss": 0.0049,
"step": 2990
},
{
"epoch": 2.1337126600284497,
"grad_norm": 0.1176241490475843,
"learning_rate": 2.3358364988333066e-06,
"loss": 0.0045,
"step": 3000
},
{
"epoch": 2.140825035561878,
"grad_norm": 0.21718467446163836,
"learning_rate": 2.3009112709386454e-06,
"loss": 0.0052,
"step": 3010
},
{
"epoch": 2.147937411095306,
"grad_norm": 0.1447042548468856,
"learning_rate": 2.2661709111622666e-06,
"loss": 0.0047,
"step": 3020
},
{
"epoch": 2.155049786628734,
"grad_norm": 0.2850367854449551,
"learning_rate": 2.2316177989683458e-06,
"loss": 0.004,
"step": 3030
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.33564220562935804,
"learning_rate": 2.197254300995953e-06,
"loss": 0.0052,
"step": 3040
},
{
"epoch": 2.1692745376955904,
"grad_norm": 0.1545067926251289,
"learning_rate": 2.163082770896943e-06,
"loss": 0.0043,
"step": 3050
},
{
"epoch": 2.1763869132290186,
"grad_norm": 0.08868335935281069,
"learning_rate": 2.1291055491747643e-06,
"loss": 0.0034,
"step": 3060
},
{
"epoch": 2.1834992887624467,
"grad_norm": 0.0678499455537346,
"learning_rate": 2.095324963024137e-06,
"loss": 0.0039,
"step": 3070
},
{
"epoch": 2.190611664295875,
"grad_norm": 0.1962461433328382,
"learning_rate": 2.061743326171668e-06,
"loss": 0.0038,
"step": 3080
},
{
"epoch": 2.197724039829303,
"grad_norm": 0.07801886707618137,
"learning_rate": 2.02836293871736e-06,
"loss": 0.0035,
"step": 3090
},
{
"epoch": 2.204836415362731,
"grad_norm": 0.3629078506453925,
"learning_rate": 1.9951860869771e-06,
"loss": 0.0038,
"step": 3100
},
{
"epoch": 2.2119487908961593,
"grad_norm": 0.8806588814039079,
"learning_rate": 1.962215043326029e-06,
"loss": 0.004,
"step": 3110
},
{
"epoch": 2.2190611664295874,
"grad_norm": 0.33169199243250613,
"learning_rate": 1.9294520660429284e-06,
"loss": 0.0036,
"step": 3120
},
{
"epoch": 2.2261735419630155,
"grad_norm": 0.12310821458251077,
"learning_rate": 1.8968993991555301e-06,
"loss": 0.0045,
"step": 3130
},
{
"epoch": 2.2332859174964437,
"grad_norm": 0.1564234234161847,
"learning_rate": 1.8645592722868223e-06,
"loss": 0.0041,
"step": 3140
},
{
"epoch": 2.240398293029872,
"grad_norm": 0.1908716606221835,
"learning_rate": 1.8324339005023273e-06,
"loss": 0.0042,
"step": 3150
},
{
"epoch": 2.2475106685633,
"grad_norm": 0.17491525199519603,
"learning_rate": 1.8005254841584035e-06,
"loss": 0.0032,
"step": 3160
},
{
"epoch": 2.2546230440967285,
"grad_norm": 0.15681019357467124,
"learning_rate": 1.768836208751516e-06,
"loss": 0.0039,
"step": 3170
},
{
"epoch": 2.2617354196301562,
"grad_norm": 0.16172138112249296,
"learning_rate": 1.7373682447685624e-06,
"loss": 0.004,
"step": 3180
},
{
"epoch": 2.268847795163585,
"grad_norm": 0.10575834882863448,
"learning_rate": 1.706123747538196e-06,
"loss": 0.0035,
"step": 3190
},
{
"epoch": 2.275960170697013,
"grad_norm": 0.18222310954574267,
"learning_rate": 1.6751048570832184e-06,
"loss": 0.0041,
"step": 3200
},
{
"epoch": 2.283072546230441,
"grad_norm": 0.14875677905536833,
"learning_rate": 1.6443136979739855e-06,
"loss": 0.003,
"step": 3210
},
{
"epoch": 2.2901849217638692,
"grad_norm": 0.10898246145730768,
"learning_rate": 1.6137523791829007e-06,
"loss": 0.0034,
"step": 3220
},
{
"epoch": 2.2972972972972974,
"grad_norm": 0.1309461753215428,
"learning_rate": 1.5834229939399637e-06,
"loss": 0.0034,
"step": 3230
},
{
"epoch": 2.3044096728307255,
"grad_norm": 0.07200423508178247,
"learning_rate": 1.5533276195893987e-06,
"loss": 0.0037,
"step": 3240
},
{
"epoch": 2.3115220483641536,
"grad_norm": 0.28943328560772674,
"learning_rate": 1.5234683174473669e-06,
"loss": 0.0039,
"step": 3250
},
{
"epoch": 2.318634423897582,
"grad_norm": 0.5192612699526135,
"learning_rate": 1.493847132660789e-06,
"loss": 0.0034,
"step": 3260
},
{
"epoch": 2.32574679943101,
"grad_norm": 0.1606295965015448,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.0044,
"step": 3270
},
{
"epoch": 2.332859174964438,
"grad_norm": 0.37034704670980706,
"learning_rate": 1.435327214056103e-06,
"loss": 0.0036,
"step": 3280
},
{
"epoch": 2.339971550497866,
"grad_norm": 0.1985714241377405,
"learning_rate": 1.406432488430508e-06,
"loss": 0.0041,
"step": 3290
},
{
"epoch": 2.3470839260312943,
"grad_norm": 0.13803180507649276,
"learning_rate": 1.3777838962708602e-06,
"loss": 0.0035,
"step": 3300
},
{
"epoch": 2.3541963015647225,
"grad_norm": 0.16321860803207505,
"learning_rate": 1.3493833997991745e-06,
"loss": 0.0033,
"step": 3310
},
{
"epoch": 2.3613086770981506,
"grad_norm": 0.2001811539323451,
"learning_rate": 1.3212329442446985e-06,
"loss": 0.0042,
"step": 3320
},
{
"epoch": 2.3684210526315788,
"grad_norm": 0.1453173744872287,
"learning_rate": 1.2933344577106822e-06,
"loss": 0.0032,
"step": 3330
},
{
"epoch": 2.3755334281650073,
"grad_norm": 0.10401910511567347,
"learning_rate": 1.2656898510423122e-06,
"loss": 0.0031,
"step": 3340
},
{
"epoch": 2.382645803698435,
"grad_norm": 0.10582948879092595,
"learning_rate": 1.2383010176958372e-06,
"loss": 0.0033,
"step": 3350
},
{
"epoch": 2.3897581792318636,
"grad_norm": 0.16511981732406306,
"learning_rate": 1.2111698336088717e-06,
"loss": 0.0039,
"step": 3360
},
{
"epoch": 2.3968705547652918,
"grad_norm": 0.14041169290258051,
"learning_rate": 1.1842981570719237e-06,
"loss": 0.0034,
"step": 3370
},
{
"epoch": 2.40398293029872,
"grad_norm": 0.216807318559693,
"learning_rate": 1.157687828601094e-06,
"loss": 0.0039,
"step": 3380
},
{
"epoch": 2.411095305832148,
"grad_norm": 0.1487410996270859,
"learning_rate": 1.1313406708120327e-06,
"loss": 0.0033,
"step": 3390
},
{
"epoch": 2.418207681365576,
"grad_norm": 0.17410715559913836,
"learning_rate": 1.1052584882950896e-06,
"loss": 0.0032,
"step": 3400
},
{
"epoch": 2.4253200568990043,
"grad_norm": 0.14679067077660998,
"learning_rate": 1.0794430674917262e-06,
"loss": 0.0029,
"step": 3410
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.11730320262217042,
"learning_rate": 1.0538961765721429e-06,
"loss": 0.0034,
"step": 3420
},
{
"epoch": 2.4395448079658606,
"grad_norm": 0.15601345944604691,
"learning_rate": 1.0286195653141822e-06,
"loss": 0.0033,
"step": 3430
},
{
"epoch": 2.4466571834992887,
"grad_norm": 0.15596374680032918,
"learning_rate": 1.0036149649834786e-06,
"loss": 0.0033,
"step": 3440
},
{
"epoch": 2.453769559032717,
"grad_norm": 0.15341222073346109,
"learning_rate": 9.788840882148803e-07,
"loss": 0.0032,
"step": 3450
},
{
"epoch": 2.460881934566145,
"grad_norm": 0.18113221503751906,
"learning_rate": 9.544286288951393e-07,
"loss": 0.0028,
"step": 3460
},
{
"epoch": 2.467994310099573,
"grad_norm": 0.23824252331061962,
"learning_rate": 9.302502620469073e-07,
"loss": 0.003,
"step": 3470
},
{
"epoch": 2.4751066856330013,
"grad_norm": 0.1804454838531882,
"learning_rate": 9.063506437139901e-07,
"loss": 0.0033,
"step": 3480
},
{
"epoch": 2.4822190611664294,
"grad_norm": 0.12129461355182411,
"learning_rate": 8.827314108479357e-07,
"loss": 0.0035,
"step": 3490
},
{
"epoch": 2.4893314366998576,
"grad_norm": 0.2496105490338266,
"learning_rate": 8.593941811959078e-07,
"loss": 0.0037,
"step": 3500
},
{
"epoch": 2.496443812233286,
"grad_norm": 0.12260976552880777,
"learning_rate": 8.363405531898833e-07,
"loss": 0.0035,
"step": 3510
},
{
"epoch": 2.503556187766714,
"grad_norm": 0.17068909040005176,
"learning_rate": 8.135721058371681e-07,
"loss": 0.0038,
"step": 3520
},
{
"epoch": 2.5106685633001424,
"grad_norm": 0.14486041747836928,
"learning_rate": 7.910903986122537e-07,
"loss": 0.0023,
"step": 3530
},
{
"epoch": 2.5177809388335706,
"grad_norm": 0.16537212820522457,
"learning_rate": 7.688969713499983e-07,
"loss": 0.0033,
"step": 3540
},
{
"epoch": 2.5248933143669987,
"grad_norm": 0.06547618532234573,
"learning_rate": 7.469933441401606e-07,
"loss": 0.0036,
"step": 3550
},
{
"epoch": 2.532005689900427,
"grad_norm": 0.09486129847604534,
"learning_rate": 7.253810172232867e-07,
"loss": 0.0029,
"step": 3560
},
{
"epoch": 2.539118065433855,
"grad_norm": 0.15420596551214288,
"learning_rate": 7.040614708879489e-07,
"loss": 0.0031,
"step": 3570
},
{
"epoch": 2.546230440967283,
"grad_norm": 0.18795827544823362,
"learning_rate": 6.830361653693673e-07,
"loss": 0.0031,
"step": 3580
},
{
"epoch": 2.5533428165007113,
"grad_norm": 0.20144541991501458,
"learning_rate": 6.623065407493801e-07,
"loss": 0.0031,
"step": 3590
},
{
"epoch": 2.5604551920341394,
"grad_norm": 0.11898776472079374,
"learning_rate": 6.418740168578208e-07,
"loss": 0.0029,
"step": 3600
},
{
"epoch": 2.5675675675675675,
"grad_norm": 0.11704775629045612,
"learning_rate": 6.217399931752627e-07,
"loss": 0.0031,
"step": 3610
},
{
"epoch": 2.5746799431009957,
"grad_norm": 0.13757018665386925,
"learning_rate": 6.019058487371687e-07,
"loss": 0.0028,
"step": 3620
},
{
"epoch": 2.581792318634424,
"grad_norm": 0.07705433560973203,
"learning_rate": 5.82372942039432e-07,
"loss": 0.0037,
"step": 3630
},
{
"epoch": 2.588904694167852,
"grad_norm": 0.12004181043862794,
"learning_rate": 5.631426109453364e-07,
"loss": 0.003,
"step": 3640
},
{
"epoch": 2.59601706970128,
"grad_norm": 0.11547199526456815,
"learning_rate": 5.44216172593916e-07,
"loss": 0.0032,
"step": 3650
},
{
"epoch": 2.6031294452347082,
"grad_norm": 0.20275686253937805,
"learning_rate": 5.255949233097451e-07,
"loss": 0.0035,
"step": 3660
},
{
"epoch": 2.6102418207681364,
"grad_norm": 0.1327960409529542,
"learning_rate": 5.072801385141429e-07,
"loss": 0.0032,
"step": 3670
},
{
"epoch": 2.617354196301565,
"grad_norm": 0.13522734646826431,
"learning_rate": 4.89273072637827e-07,
"loss": 0.0027,
"step": 3680
},
{
"epoch": 2.6244665718349927,
"grad_norm": 0.0921535098896707,
"learning_rate": 4.7157495903498105e-07,
"loss": 0.0029,
"step": 3690
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.1305724860300583,
"learning_rate": 4.541870098987911e-07,
"loss": 0.0035,
"step": 3700
},
{
"epoch": 2.6386913229018494,
"grad_norm": 0.1366897855739292,
"learning_rate": 4.371104161784073e-07,
"loss": 0.0039,
"step": 3710
},
{
"epoch": 2.6458036984352775,
"grad_norm": 0.16675061725996185,
"learning_rate": 4.2034634749738623e-07,
"loss": 0.003,
"step": 3720
},
{
"epoch": 2.6529160739687057,
"grad_norm": 0.12062320450080749,
"learning_rate": 4.038959520735658e-07,
"loss": 0.0032,
"step": 3730
},
{
"epoch": 2.660028449502134,
"grad_norm": 0.07277873243358957,
"learning_rate": 3.8776035664043033e-07,
"loss": 0.0033,
"step": 3740
},
{
"epoch": 2.667140825035562,
"grad_norm": 0.09995970754512991,
"learning_rate": 3.719406663699349e-07,
"loss": 0.0036,
"step": 3750
},
{
"epoch": 2.67425320056899,
"grad_norm": 0.14356536332083528,
"learning_rate": 3.564379647968064e-07,
"loss": 0.0034,
"step": 3760
},
{
"epoch": 2.681365576102418,
"grad_norm": 0.1289519043233803,
"learning_rate": 3.4125331374433414e-07,
"loss": 0.0029,
"step": 3770
},
{
"epoch": 2.6884779516358464,
"grad_norm": 0.10645779562131363,
"learning_rate": 3.2638775325163517e-07,
"loss": 0.0027,
"step": 3780
},
{
"epoch": 2.6955903271692745,
"grad_norm": 0.10980156190201901,
"learning_rate": 3.1184230150243025e-07,
"loss": 0.0026,
"step": 3790
},
{
"epoch": 2.7027027027027026,
"grad_norm": 0.1212601092847071,
"learning_rate": 2.9761795475529375e-07,
"loss": 0.0027,
"step": 3800
},
{
"epoch": 2.7098150782361308,
"grad_norm": 0.10465054324216685,
"learning_rate": 2.8371568727542486e-07,
"loss": 0.0032,
"step": 3810
},
{
"epoch": 2.716927453769559,
"grad_norm": 0.14087107927522052,
"learning_rate": 2.7013645126791446e-07,
"loss": 0.0027,
"step": 3820
},
{
"epoch": 2.724039829302987,
"grad_norm": 0.11777162015019617,
"learning_rate": 2.5688117681252677e-07,
"loss": 0.0031,
"step": 3830
},
{
"epoch": 2.731152204836415,
"grad_norm": 0.12580839073471906,
"learning_rate": 2.439507717999945e-07,
"loss": 0.0027,
"step": 3840
},
{
"epoch": 2.7382645803698438,
"grad_norm": 0.11019351778666993,
"learning_rate": 2.3134612186983817e-07,
"loss": 0.0032,
"step": 3850
},
{
"epoch": 2.7453769559032715,
"grad_norm": 0.2540811705778796,
"learning_rate": 2.1906809034970057e-07,
"loss": 0.0032,
"step": 3860
},
{
"epoch": 2.7524893314367,
"grad_norm": 0.14533749828341638,
"learning_rate": 2.0711751819622038e-07,
"loss": 0.0028,
"step": 3870
},
{
"epoch": 2.759601706970128,
"grad_norm": 0.17723003777910762,
"learning_rate": 1.954952239374286e-07,
"loss": 0.0033,
"step": 3880
},
{
"epoch": 2.7667140825035563,
"grad_norm": 0.1714781247080342,
"learning_rate": 1.8420200361669137e-07,
"loss": 0.0028,
"step": 3890
},
{
"epoch": 2.7738264580369845,
"grad_norm": 0.1442879683659834,
"learning_rate": 1.732386307381767e-07,
"loss": 0.0028,
"step": 3900
},
{
"epoch": 2.7809388335704126,
"grad_norm": 0.11658671113478708,
"learning_rate": 1.6260585621388604e-07,
"loss": 0.0032,
"step": 3910
},
{
"epoch": 2.7880512091038407,
"grad_norm": 0.13555304661960596,
"learning_rate": 1.523044083122138e-07,
"loss": 0.0033,
"step": 3920
},
{
"epoch": 2.795163584637269,
"grad_norm": 0.16068613052421124,
"learning_rate": 1.4233499260807194e-07,
"loss": 0.0034,
"step": 3930
},
{
"epoch": 2.802275960170697,
"grad_norm": 0.1397672323891182,
"learning_rate": 1.326982919345582e-07,
"loss": 0.003,
"step": 3940
},
{
"epoch": 2.809388335704125,
"grad_norm": 0.1228326098193467,
"learning_rate": 1.2339496633619218e-07,
"loss": 0.0026,
"step": 3950
},
{
"epoch": 2.8165007112375533,
"grad_norm": 0.09294084238773208,
"learning_rate": 1.1442565302370146e-07,
"loss": 0.0026,
"step": 3960
},
{
"epoch": 2.8236130867709814,
"grad_norm": 0.10538827214385106,
"learning_rate": 1.0579096633038411e-07,
"loss": 0.0033,
"step": 3970
},
{
"epoch": 2.8307254623044096,
"grad_norm": 0.09895208971100541,
"learning_rate": 9.749149767002197e-08,
"loss": 0.0029,
"step": 3980
},
{
"epoch": 2.8378378378378377,
"grad_norm": 0.17612347880517987,
"learning_rate": 8.952781549638412e-08,
"loss": 0.0038,
"step": 3990
},
{
"epoch": 2.844950213371266,
"grad_norm": 0.13285843764249902,
"learning_rate": 8.190046526428241e-08,
"loss": 0.0028,
"step": 4000
},
{
"epoch": 2.852062588904694,
"grad_norm": 0.15853886614347157,
"learning_rate": 7.460996939221643e-08,
"loss": 0.0032,
"step": 4010
},
{
"epoch": 2.8591749644381226,
"grad_norm": 0.10115826454451997,
"learning_rate": 6.765682722659151e-08,
"loss": 0.0034,
"step": 4020
},
{
"epoch": 2.8662873399715503,
"grad_norm": 0.16050424912282388,
"learning_rate": 6.104151500751609e-08,
"loss": 0.0026,
"step": 4030
},
{
"epoch": 2.873399715504979,
"grad_norm": 0.10822054946183253,
"learning_rate": 5.476448583618288e-08,
"loss": 0.0035,
"step": 4040
},
{
"epoch": 2.8805120910384066,
"grad_norm": 0.1113521110254991,
"learning_rate": 4.8826169643832464e-08,
"loss": 0.0026,
"step": 4050
},
{
"epoch": 2.887624466571835,
"grad_norm": 0.14081228392187445,
"learning_rate": 4.322697316231361e-08,
"loss": 0.0032,
"step": 4060
},
{
"epoch": 2.8947368421052633,
"grad_norm": 0.11756191197474342,
"learning_rate": 3.796727989621385e-08,
"loss": 0.0024,
"step": 4070
},
{
"epoch": 2.9018492176386914,
"grad_norm": 0.14346626654053973,
"learning_rate": 3.304745009660326e-08,
"loss": 0.003,
"step": 4080
},
{
"epoch": 2.9089615931721196,
"grad_norm": 0.13833583160259022,
"learning_rate": 2.8467820736350903e-08,
"loss": 0.0028,
"step": 4090
},
{
"epoch": 2.9160739687055477,
"grad_norm": 0.08441703695039304,
"learning_rate": 2.422870548705103e-08,
"loss": 0.003,
"step": 4100
},
{
"epoch": 2.923186344238976,
"grad_norm": 0.15199272572784162,
"learning_rate": 2.0330394697534726e-08,
"loss": 0.0032,
"step": 4110
},
{
"epoch": 2.930298719772404,
"grad_norm": 0.09905970954206261,
"learning_rate": 1.677315537398583e-08,
"loss": 0.0033,
"step": 4120
},
{
"epoch": 2.937411095305832,
"grad_norm": 0.12746964816800027,
"learning_rate": 1.355723116165164e-08,
"loss": 0.003,
"step": 4130
},
{
"epoch": 2.9445234708392602,
"grad_norm": 0.1730883953102828,
"learning_rate": 1.0682842328154086e-08,
"loss": 0.003,
"step": 4140
},
{
"epoch": 2.9516358463726884,
"grad_norm": 0.14592570068315344,
"learning_rate": 8.150185748405092e-09,
"loss": 0.0034,
"step": 4150
},
{
"epoch": 2.9587482219061165,
"grad_norm": 0.16218729377273186,
"learning_rate": 5.959434891121274e-09,
"loss": 0.0031,
"step": 4160
},
{
"epoch": 2.9658605974395447,
"grad_norm": 0.1534720207270455,
"learning_rate": 4.110739806940656e-09,
"loss": 0.0028,
"step": 4170
},
{
"epoch": 2.972972972972973,
"grad_norm": 0.1535652411238345,
"learning_rate": 2.604227118148117e-09,
"loss": 0.0025,
"step": 4180
},
{
"epoch": 2.9800853485064014,
"grad_norm": 0.21854345544372025,
"learning_rate": 1.4400000100017741e-09,
"loss": 0.0028,
"step": 4190
},
{
"epoch": 2.987197724039829,
"grad_norm": 0.11360018294244285,
"learning_rate": 6.181382236641887e-10,
"loss": 0.0027,
"step": 4200
},
{
"epoch": 2.9943100995732577,
"grad_norm": 0.13109703302719727,
"learning_rate": 1.3869805074284704e-10,
"loss": 0.003,
"step": 4210
},
{
"epoch": 3.0,
"step": 4218,
"total_flos": 247294279680000.0,
"train_loss": 0.26427117863254007,
"train_runtime": 27747.0354,
"train_samples_per_second": 9.727,
"train_steps_per_second": 0.152
}
],
"logging_steps": 10,
"max_steps": 4218,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 247294279680000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}